SENDAs Agreement 1 Update 2010-2022

First step of deduplication process. Exploratory data analysis was conducted, addressing issues such as data entry errors, missing values, and the conversion of the date of birth into the age at the time of the first discharge for each individual.

Author

Andrés González Santa Cruz

Published

September 27, 2025


Data Loading and Exploration

Loading Packages and uniting databases

Proceed to load the necessary packages.

Code
# invisible("Only run from Ubuntu")
# if (!(Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv"))) {
#   if(Sys.info()["sysname"]!="Windows"){
#     Sys.setenv(RETICULATE_PYTHON = "/home/fondecytacc/.pyenv/versions/3.11.5/bin/python")
#   }
# }

#clean enviroment
rm(list = ls()); gc()

time_before_dedup1<-Sys.time()

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# --- Bootstrap reticulate con ruta relativa a getwd() ---
suppressPackageStartupMessages(library(reticulate))

# Busca .mamba_root/envs/py311/python.exe desde getwd() hacia padres
# Walk upward from `start` toward the filesystem root, returning the first
# normalized path where `rel` (default: .mamba_root/envs/py311/python.exe)
# exists, or NA_character_ if no ancestor directory contains it.
find_python_rel <- function(start = getwd(),
                            rel = file.path(".mamba_root","envs","py311","python.exe")) {
  dir <- normalizePath(start, winslash = "/", mustWork = FALSE)
  candidate <- normalizePath(file.path(dir, rel), winslash = "/", mustWork = FALSE)
  while (!file.exists(candidate)) {
    parent <- dirname(dir)
    if (identical(parent, dir)) {
      # dirname() is a fixed point only at the filesystem root: give up.
      return(NA_character_)
    }
    dir <- parent
    candidate <- normalizePath(file.path(dir, rel), winslash = "/", mustWork = FALSE)
  }
  candidate
}

py <- find_python_rel()

if (is.na(py)) {
  stop("No se encontró Python relativo a getwd() (buscando '.mamba_root/envs/py311/python.exe').\n",
       "Directorio actual: ", getwd())
}

# Force this interpreter: clear competing reticulate env vars first so
# RETICULATE_PYTHON is the single source of truth for the session.
Sys.unsetenv(c("RETICULATE_CONDAENV","RETICULATE_PYTHON_FALLBACK"))
Sys.setenv(RETICULATE_PYTHON = py)
reticulate::use_python(py, required = TRUE)  # TRUE, not T (T is reassignable)

py_config()  # verify which interpreter reticulate actually bound


#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

# tidy, robust, and commented
# Load an NDP .Rdata snapshot into `load_into`, supporting two workflows:
#   * desktop: a plain <base_name>_<date_tag>.Rdata already sits in out_dir;
#   * server/Docker: a password-protected multi-part 7z archive in in_dir
#     is extracted into out_dir first (password from env var PASS_PPIO).
#
# date_tag      date stamp embedded in the file names, e.g. "2025_09_27".
# base_name     file-name prefix, e.g. "22_ndp".
# input_subdir  subdir of getwd() holding compressed inputs on the server.
# out_subdir    subdir of the project root for (extracted) .Rdata files.
# load_into     environment that receives the loaded objects.
#
# Returns, invisibly, the character vector of object names loaded.
load_ndp <- function(date_tag,
                     base_name,
                     input_subdir,
                     out_subdir,
                     load_into    = .GlobalEnv) {

  # Are we in RStudio Server or Docker?
  is_server <- Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")

  # Project root = current WD without a trailing "/cons"
  # Safer than gsub everywhere
  wd <- getwd()
  project_root <- sub("(/)?cons/?$", "", wd)

  # Build dirs
  out_dir  <- file.path(project_root, out_subdir)
  in_dir   <- if (is_server) file.path(getwd(), input_subdir) else out_dir

  # Filenames (choose one canonical extension spelling)
  rdata_file   <- sprintf("%s_%s.Rdata", base_name, date_tag)
  seven_z_part <- sprintf("%s_%s.Rdata.7z.001", base_name, date_tag)
  enc_file     <- sprintf("%s_%s.Rdata.enc",  base_name, date_tag)  # only if you actually encrypt to .enc

  # (Removed dead code: a Windows-only `envpath` guess at a Google Drive /
  # E: root was computed here but never read anywhere in this function.)

  # Ensure dirs exist (won't error if already present)
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

  # Helper: load Rdata into the specified environment
  load_rdata <- function(path) {
    stopifnot(file.exists(path))
    loaded <- load(path, envir = load_into)
    message("Loaded objects: ", paste(loaded, collapse = ", "))
    invisible(loaded)
  }

  if (!is_server) {
    # Desktop workflow: expect plain .Rdata in out_dir
    local_rdata <- file.path(out_dir, rdata_file)
    if (!file.exists(local_rdata)) {
      stop("Rdata file not found: ", local_rdata)
    }
    return(load_rdata(local_rdata))

  } else {
    # Server/Docker workflow: expect compressed/encrypted parts in _input
    seven_z_path <- file.path(in_dir, seven_z_part)
    enc_path     <- file.path(in_dir, enc_file)
    out_rdata    <- file.path(out_dir, rdata_file)

    if (file.exists(seven_z_path)) {
      # Extract 7z multi-part to out_dir using password
      pass <- Sys.getenv("PASS_PPIO", unset = NA_character_)
      if (is.na(pass) || pass == "") stop("Missing PASS_PPIO env var for 7z decryption.")
      # 7z command: x (extract), -p<pass>, -o<outdir>
      # SECURITY NOTE(review): the password is placed on the 7z command
      # line and is therefore visible in the process table while the
      # extraction runs; acceptable only on single-user machines.
      args <- c("x", shQuote(seven_z_path), sprintf("-p%s", pass), paste0("-o", shQuote(out_dir)))
      status <- system2("7z", args = args, stdout = TRUE, stderr = TRUE)
      # Success is judged by the target file appearing, not the exit code.
      if (!file.exists(out_rdata)) {
        stop("Extraction finished but target not found: ", out_rdata, "\n7z output:\n", paste(status, collapse = "\n"))
      }
      return(load_rdata(out_rdata))

    } else if (file.exists(enc_path)) {
      # If you truly have a raw .enc, you need a decryption step here (not loadable as-is).
      stop("Found encrypted file but no extractor defined for .enc: ", enc_path)

    } else if (file.exists(out_rdata)) {
      # Already extracted earlier
      return(load_rdata(out_rdata))

    } else {
      stop("No input found in: ", in_dir,
           "\nTried: ", seven_z_path, " and ", enc_path,
           "\nAlso looked for already-extracted: ", out_rdata)
    }
  }
}

load_ndp(date_tag = "2025_09_27", 
                     base_name = "22_ndp",
                     input_subdir = "_input",
                     out_subdir   = file.path("data", "20241015_out"),
                     load_into    = .GlobalEnv)
          used (Mb) gc trigger (Mb) max used (Mb)
Ncells  607621 32.5    1282490 68.5  1164779 62.3
Vcells 1246965  9.6    8388608 64.0  2184548 16.7
python:         G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe
libpython:      G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python311.dll
pythonhome:     G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311
version:        3.11.5 | packaged by conda-forge | (main, Aug 27 2023, 03:23:48) [MSC v.1936 64 bit (AMD64)]
Architecture:   64bit
numpy:           [NOT FOUND]

NOTE: Python version was forced by RETICULATE_PYTHON
Code
#https://github.com/rstudio/renv/issues/544
#renv falls back to copying rather than symlinking, which is evidently very slow in this configuration.
renv::settings$use.cache(FALSE)

# "implicit" snapshots scan the project code for dependencies.
# (The previous comment claimed "explicit dependencies (in DESCRIPTION)",
# which describes the *other* snapshot type.)
renv::settings$snapshot.type("implicit")

# Check that Rtools is installed; try() keeps the render going if installr
# is unavailable or the check fails.
try(installr::install.Rtools(check_r_update = FALSE))

Installing package into ‘G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32’ (as ‘lib’ is unspecified)

Code
# Require a recent Quarto. Note: `!` binds more loosely than `>=` in R, so
# this reads !(quarto_version() >= "1.7.29"); quarto_version() returns a
# numeric_version, for which the string comparison is well-defined.
if(!quarto::quarto_version()>="1.7.29"){
stop("You need to install a recent quarto version")   # version released 28-Apr-2025
}

#change repository to the Chilean CRAN mirror; local() keeps the temporary
#`r` variable out of the global environment.
local({
  r <- getOption("repos")
  r["CRAN"] <- "https://cran.dcc.uchile.cl/"
  options(repos=r)
})

# Bootstrap pacman: install on first run, then attach.
if(!require(pacman)){install.packages("pacman");require(pacman)}

Cargando paquete requerido: pacman

Code
# Bootstrap pak (fast parallel package installer): install on first run, then attach.
if(!require(pak)){install.packages("pak");require(pak)}

Cargando paquete requerido: pak

Code
# Remove stale 00LOCK directories left by interrupted installs.
pacman::p_unlock(lib.loc = .libPaths()) # avoids problems when reinstalling packages

No 00LOCK detected in: G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32 No 00LOCK detected in: C:/Program Files/R/R-4.4.1/library

Code
# Pin the exact R version on Windows so the renv library path
# (renv/library/windows/R-4.4/...) stays consistent across renders.
if(Sys.info()["sysname"]=="Windows"){
if (getRversion() != "4.4.1") { stop("Requires R version 4.4.1; Actual: ", getRversion()) }
}

#check docker
# Probe the Docker daemon by running `docker info`.
#
# Fix: with intern = TRUE a non-zero exit status only raises a *warning*
# (see the "status 1" warning in the rendered output), so the try() at the
# call site never detected failure and always reported "Docker is running.".
# A non-zero status is now escalated to an error so try() can catch it.
#
# Returns the captured stdout of `docker info` invisibly on success.
check_docker_running <- function() {
  out <- suppressWarnings(
    system("docker info", intern = TRUE, ignore.stderr = TRUE)
  )
  status <- attr(out, "status")  # NULL when the command exited with 0
  if (!is.null(status) && status != 0) {
    stop("'docker info' failed with exit status ", status, call. = FALSE)
  }
  invisible(out)
}

# Point the user at the Docker Desktop installer.
# Side effect only: opens the download page in the default browser.
install_docker <- function() {
  # Open the Docker Desktop download page in the browser for installation
  browseURL("https://www.docker.com/products/docker-desktop")
}

# Main logic: if probing Docker fails, prompt installation via the local
# install_docker() helper defined above.
# Fix: the original called liftr::install_docker() -- a function from an
# unrelated, unloaded package -- leaving the local helper unused.
if (inherits(try(check_docker_running(), silent = TRUE), "try-error")) {
  install_docker()
} else {
  message("Docker is running.")
}

Warning in system(“docker info”, intern = TRUE, ignore.stderr = TRUE): el comando ejecutado ‘docker info’ tiene el estatus 1

Docker is running.

Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#PACKAGES#######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

unlink("*_cache", recursive=T)

# ----------------------------------------------------------------------
# 2. Use a single pak::pkg_install() call for most CRAN packages
# ----------------------------------------------------------------------

# CRAN packages required by the deduplication pipeline, grouped by purpose.
# ("git" itself is handled outside R, hence omitted.)
paks <-
  c(
    # -- GitHub connectivity and project tooling --
    "gh",        # GitHub API client
    "gitcreds",  # Git credential management (usernames, passwords, tokens)
    "usethis",   # common project-setup helpers
    "devtools",  # install in-development packages
    "renv",      # package/library administration
    # -- Data manipulation and reporting --
    "knitr", "pander", "DT",
    "fuzzyjoin",            # approximate/fuzzy joins
    "tidyverse", "janitor", # tables and cleaning
    "kableExtra",           # contingency tables
    "reticulate",           # R <-> Python bridge
    "polars", "sqldf",      # big-data manipulation
    "nanoparquet",          # large databases via parquet
    # -- R / RStudio interface and rendering --
    "installr", "rmarkdown", "quarto", "yaml", # (rstudioapi intentionally omitted)
    "clock",                # time handling
    "ggpubr",               # combine plots
    "furrr",                # parallelized iterative processing
    "tidytable",            # tibble-like syntax over data.table
    "caret",                # training/testing splits
    "missRanger", "mice",   # missing-data imputation
    "job",                  # modularize tasks in background jobs
    "webshot"               # PhantomJS install checks
  )

# dplyr
# janitor
# reshape2
# tidytable
# arrow
# boot
# broom
# car
# caret
# data.table
# DiagrammeR
# DiagrammeRsvg
# dplyr
# epiR
# epitools
# ggplot2
# glue
# htmlwidgets
# knitr
# lubridate
# naniar
# parallel
# polycor
# pROC
# psych
# readr
# rio
# rsvg
# scales
# stringr
# tableone
# rmarkdown
# biostat3
# codebook
# finalfit
# Hmisc
# kableExtra
# knitr
# devtools
# tidyr
# stringi
# stringr
# muhaz
# sqldf
# compareGroups
# survminer
# lubridate
# ggfortify
# car
# fuzzyjoin
# compareGroups
# caret
# job
# htmltools
# nanoparquet
# ggpubr
# polars
# installr
# clock
# pander
# reshape
# mice
# missRanger
# VIM
# withr
# biostat3
# broom
# glue
# finalfit
# purrr
# sf


# pak::pkg_install(paks)

# Print pak's situation report (version, platform, library paths,
# dependency health) -- diagnostic output only, no side effects.
pak::pak_sitrep()
# pak::sysreqs_check_installed(unique(unlist(paks)))
#pak::lockfile_create(unique(unlist(paks)),  "dependencies_duplicates24.lock", dependencies=T)
#pak::lockfile_install("dependencies_duplicates24.lock")
#https://rdrr.io/cran/pak/man/faq.html
#pak::cache_delete()

library(tidytable)

Adjuntando el paquete: ‘tidytable’

The following objects are masked from ‘package:stats’:

dt, filter, lag

The following object is masked from ‘package:base’:

%in%
Code
library(polars)

Warning: package ‘polars’ was built under R version 4.4.3

Code
library(ggplot2)
library(readr)

Adjuntando el paquete: ‘readr’

The following object is masked by ‘.GlobalEnv’:

parse_date
Code
# ----------------------------------------------------------------------
# 3. Activate polars code completion (safe to try even if it fails)
# ----------------------------------------------------------------------
# Best-effort: try() swallows the error when the installed polars build
# does not support code completion.
try(polars_code_completion_activate())

Using code completion in ‘native’ mode.

Code
# ----------------------------------------------------------------------
# 4. BPMN from GitHub (not on CRAN, so install via devtools if missing)
# ----------------------------------------------------------------------
if (!requireNamespace("bpmn", quietly = TRUE)) {
  devtools::install_github("bergant/bpmn")
}

# ----------------------------------------------------------------------
# 5. PhantomJS Check (use webshot if PhantomJS is missing)
# ----------------------------------------------------------------------
if (!webshot::is_phantomjs_installed()) {
  webshot::install_phantomjs()
}

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#FUNCTIONS######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_


# WINDOWS: report memory limits. memory.size()/memory.limit() were made
# defunct stubs in R 4.2.0 (they warn and return Inf, as seen in the
# rendered output), so only call them on older R where they still work.
if(.Platform$OS.type == "windows" && getRversion() < "4.2.0") withAutoprint({
  memory.size()
  memory.size(TRUE)
  memory.limit()
})

Warning: ‘memory.size()’ is no longer supported

Warning: ‘memory.size()’ is no longer supported

Warning: ‘memory.limit()’ is no longer supported

Code
# memory.limit() is defunct since R 4.2.0 (warns, returns Inf); only
# attempt to raise the limit on older Windows R where it had an effect.
if (getRversion() < "4.2.0") memory.limit(size=56000)

Warning: ‘memory.limit()’ is no longer supported

Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# NO MORE DEBUGS
options(error = NULL)        # reset in case options(error = recover) or browser() was set earlier
options(browserNLdisabled = FALSE)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# CORRECT PROBLEMATIC DATES
# 1) Smart parser for messy date strings
# Parse messy date strings into Date values.
# Normalises whitespace, strips everything but digits and common
# separators, blanks out known placeholder values, then tries the
# ymd -> dmy -> mdy orderings in turn (first successful parse wins).
parse_date_smart <- function(x) {
    cleaned <- stringr::str_squish(x)
    cleaned <- stringr::str_replace_all(cleaned, "[^0-9/\\-\\.]", "")  # keep digits and common seps
    # Placeholder strings that really mean "no date recorded".
    for (sentinel in c("", "00000000", "00/00/0000", "0")) {
        cleaned <- na_if(cleaned, sentinel)
    }
    
    # try ymd, then dmy, then mdy (quietly)
    parsed <- suppressWarnings(lubridate::ymd(cleaned))
    parsed <- coalesce(parsed, suppressWarnings(lubridate::dmy(cleaned)))
    parsed <- coalesce(parsed, suppressWarnings(lubridate::mdy(cleaned)))
    as.Date(parsed)
}

# 2) Birth-date validity (range + age cap)
# Vectorised plausibility check for birth dates: not NA, not before
# min_year, not in the future relative to ref_date, and implying an age
# of at most max_age years.
is_valid_birth <- function(d, ref_date = Sys.Date(),
                           min_year = 1900, max_age = 100) {
    # NULL input yields logical(0), since length(NULL) is 0.
    if (is.null(d)) {
        return(rep(FALSE, length(d)))
    }
    lower_bound <- as.Date(sprintf("%d-01-01", min_year))
    # Truncated age in whole years at ref_date (NA dates give NA ages,
    # which the !is.na(d) term already forces to FALSE).
    age_years <- as.integer(lubridate::time_length(lubridate::interval(d, ref_date), "years"))
    !is.na(d) & d >= lower_bound & d <= ref_date & age_years <= max_age
}

# 3) Generic applier: uses 'row' from the problems table to update df[[target_col]]
# Apply manual date corrections listed in `problems` to df[[target_col]].
# `problems` supplies target row indices (row_col) and raw replacement
# strings (actual_col); replacements are parsed with parse_date_smart()
# and only written when the row index is in range and the parsed date
# passes is_valid_birth(). Returns the (possibly) updated df.
apply_date_corrections <- function(df, problems,
                                   target_col = "birth_date",
                                   row_col    = "row",
                                   actual_col = "actual") {
    n_rows <- nrow(df)
    
    target_rows <- problems[[row_col]]
    parsed      <- parse_date_smart(problems[[actual_col]])
    
    # Keep only corrections whose row index is known and inside df.
    in_range    <- !is.na(target_rows) & target_rows >= 1 & target_rows <= n_rows
    target_rows <- target_rows[in_range]
    parsed      <- parsed[in_range]
    
    # Keep only plausible birth dates (is_valid_birth never returns NA).
    plausible   <- is_valid_birth(parsed)
    target_rows <- target_rows[plausible]
    parsed      <- parsed[plausible]
    
    if (length(target_rows)) {
        df[[target_col]][target_rows] <- parsed
    }
    df
}
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#NAs are replaced with "" in knitr kable
options(knitr.kable.NA = '')

# pander tables: "," as thousands separator, "." as decimal mark.
pander::panderOptions('big.mark', ',')
pander::panderOptions('decimal.mark', '.')

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#
#to format rows in bold
# Wrap selected cells of a data.frame in pandoc emphasis markers.
#
# df    data.frame to format (affected columns are coerced to character).
# rows  row positions to format.
# cols  column names or positions to format.
# value one of "italics", "bold", "strikethrough"; defaults to "italics".
#
# Empty cells stay empty; spaces inside formatted cells are removed so the
# markers hug the text. Returns the modified data.frame.
format_cells <- function(df, rows, cols, value = c("italics", "bold", "strikethrough")){

  # Fix: match.arg() collapses the default vector to a single choice.
  # Previously, omitting `value` indexed the marker map with all three
  # names, yielding a length-3 markup vector and a replacement-length
  # error on assignment below.
  value <- match.arg(value)

  # select the correct markup
  # one * for italics, two ** for bold, ~~ for strikethrough
  # (renamed from `map` to avoid shadowing purrr::map)
  markers <- setNames(c("*", "**", "~~"), c("italics", "bold", "strikethrough"))
  markup <- markers[[value]]

  for (r in rows){
    for(c in cols){

      # Make sure values are not factors
      df[[c]] <- as.character(df[[c]])

      # Update formatting; leave empty cells untouched
      df[r, c] <- ifelse(nchar(df[r, c]) == 0, "", paste0(markup, gsub(" ", "", df[r, c]), markup))
    }
  }

  return(df)
}
#To produce line breaks in messages and warnings
# Render errors/warnings/messages as Bootstrap alert boxes in HTML output,
# turning knitr's "##" prefixes into line breaks.
# Fix: the original patterns '^##\ Error' and '^##\ Warning:' used the
# string escape "\ ", which R rejects as an unrecognized escape at parse
# time; a plain space matches the same text.
knitr::knit_hooks$set(
   error = function(x, options) {
     paste('\n\n<div class="alert alert-danger" style="font-size: small !important;">',
           gsub('##', '\n', gsub('^## Error', '**Error**', x)),
           '</div>', sep = '\n')
   },
   warning = function(x, options) {
     paste('\n\n<div class="alert alert-warning" style="font-size: small !important;">',
           gsub('##', '\n', gsub('^## Warning:', '**Warning**', x)),
           '</div>', sep = '\n')
   },
   message = function(x, options) {
     paste('<div class="message" style="font-size: small !important;">',
           gsub('##', '\n', x),
           '</div>', sep = '\n')
   }
)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# Summarise a vector coercible to Date: minimum, maximum and a fixed set
# of percentiles (0.1% .. 99.9%). NAs are dropped. Returns a one-row
# data.frame whose columns are Dates.
sum_dates <- function(x){

  # Fix: convert once up front instead of re-running unclass(as.Date(x))
  # for every one of the eleven statistics below.
  days <- unclass(as.Date(x))
  to_date <- function(v) as.Date(v, origin = "1970-01-01")

  cbind.data.frame(
min= to_date(min(days, na.rm=T)),
p001= to_date(quantile(days, .001, na.rm=T)),
p005= to_date(quantile(days, .005, na.rm=T)),
p025= to_date(quantile(days, .025, na.rm=T)),
p25= to_date(quantile(days, .25, na.rm=T)),
p50= to_date(quantile(days, .5, na.rm=T)),
p75= to_date(quantile(days, .75, na.rm=T)),
p975= to_date(quantile(days, .975, na.rm=T)),
p995= to_date(quantile(days, .995, na.rm=T)),
p999= to_date(quantile(days, .999, na.rm=T)),
max= to_date(max(days, na.rm=T))
  )
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# Define the function adapted for Polars
# Summarise a date column of a polars DataFrame: min, max and the same
# percentile levels as sum_dates(). Returns a one-row polars DataFrame.
# NOTE(review): quantile columns are named by stripping the first "." from
# as.character(q), so 0.001 -> "p0001" and 0.5 -> "p05" -- this does NOT
# match sum_dates()'s "p001"/"p50" naming; confirm downstream consumers
# before harmonising either side.
sum_dates_polars <- function(df, date_col) {
  # Create the list of quantiles
  quantiles <- c(0.001, 0.005, 0.025, 0.25, 0.5, 0.75, 0.975, 0.995, 0.999)
  # Create expressions to calculate min and max
  expr_list <- list(
    pl$col(date_col)$min()$alias("min"),
    pl$col(date_col)$max()$alias("max")
  )
  # Add expressions for quantiles (append wraps each expression as one
  # new list element)
  for (q in quantiles) {
    expr_list <- append(expr_list, pl$col(date_col)$quantile(q)$alias(paste0("p", sub("\\.", "", as.character(q)))))
  }
  # Apply the expressions and return a DataFrame with the results
  df$select(expr_list)
}

# Custom function for sampling with a seed
# Draw `size` rows from `data` reproducibly.
# NOTE: set.seed() mutates the session-wide RNG state, so any random
# draws after this call are also affected.
sample_n_with_seed <- function(data, size, seed) {
  set.seed(seed)
  dplyr::sample_n(data, size)
}

# Return the single most frequent value in `x`, or NA when the maximum
# frequency is shared by more than one value (a tie).
# Fix: counting via tabulate(match(x, uniq_vals)) is O(n) instead of the
# previous O(n^2) sapply(sum(x == val)), and -- unlike `==` -- it counts NA
# entries correctly instead of turning every count into NA.
most_frequent <- function(x) { 
  uniq_vals <- unique(x)
  freq_vals <- tabulate(match(x, uniq_vals))
  most_freq <- uniq_vals[freq_vals == max(freq_vals)]
  
  if (length(most_freq) == 1) {
    return(most_freq)
  } else {
    return(NA)
  }
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#CONFIG #######################################################################
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

options(scipen=2) # bias printed numbers toward fixed notation over scientific

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
Error in contrib.url(repos, "source") : 
  trying to use CRAN without setting a mirror
* pak version:
- 0.8.0.1
* Version information:
- pak platform: x86_64-w64-mingw32 (current: x86_64-w64-mingw32, compatible)
- pak repository: - (local install?)
* Optional packages installed:
- pillar
* Library path:
- G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32
- C:/Program Files/R/R-4.4.1/library
* pak is installed at G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32/pak.
* Dependency versions:
- callr      3.7.6      
- cli        3.6.2      
- curl       5.2.1      
- desc       1.4.3      
- filelock   1.0.3      
- jsonlite   1.8.8      
- lpSolve    5.6.23.9000
- pkgbuild   1.4.4      
- pkgcache   2.2.2.9000 
- pkgdepends 0.7.2.9000 
- pkgsearch  3.1.3.9000 
- processx   3.8.4      
- ps         1.7.6      
- R6         2.5.1      
- zip        2.3.1      
* Dependencies can be loaded
> memory.size()
[1] Inf
> memory.size(TRUE)
[1] Inf
> memory.limit()
[1] Inf
[1] Inf


Note

To assess the main goals of the study, we first focused on distinguishing each user across the yearly datasets obtained from SENDA (1). Next, we separated each user’s treatments (2). Finally, we normalized, standardized, and cleaned each treatment (3). Although these stages may appear conceptually separate and sequential, they are interdependent (e.g., some variables needed to be standardized to identify duplicate entries). Throughout this document, we use the terms “rows”, “cases”, “observations” or “treatment episodes” interchangeably to refer to entries in the dataset.


0. Correct dates

In this section, we address missing or inconsistent dates within the dataset. First, we add a row number to uniquely identify each observation, especially useful for tracking deleted rows. Then, we replace missing birth dates with a default value, recording the manual correction in an observation column (OBS). Next, for missing admission dates, we attempt to replace them with corresponding SENDA dates where available, again updating OBS. Finally, we handle problematic discharge dates by examining rows with missing or unusual values and making targeted corrections. Specifically, we use conditional logic to update discharge dates based on unique hash_key and adm_date combinations, ensuring consistency in treatment timelines. Any remaining issues with discharge date parsing are logged for further review, and prior versions of the discharge date column are removed to maintain consistency.

Code
cat("HASH_KEY for hash_key\n")
colnames(SISTRAT23_c1_2010_2024_df2)[colnames(SISTRAT23_c1_2010_2024_df2) == "HASH_KEY"] <- "hash_key"

cat(paste0("Added row number (rn) to identify deleted observations: ",formatC(nrow(SISTRAT23_c1_2010_2024_df2), big.mark=",")," in total\n"))
SISTRAT23_c1_2010_2024_df2$rn <-1:nrow(SISTRAT23_c1_2010_2024_df2)

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("check problems with values in each variable of the dataset\n")
colnames_c1oct<-
setdiff(names(SISTRAT23_c1_2010_2024_df2), c("rn", "edad", "birth_date", "hash_key", "codigo_identificacion", "dias_en_tratamiento", "n_meses_en_tratamiento", "edad_inicio_consumo", "edad_inicio_sustancia_principal", "id_centro", "adm_date", "senda_adm_date", "discharge_date", "fecha_ultimo_tratamiento", "fecha_ingreso_a_tratamiento", "fecha_ingreso_a_convenio_senda", "fecha_egreso_de_tratamiento", "n_meses_en_senda", "dias_en_senda"))

unique_values_list_c1_duplicates24 <- setNames(
  lapply(colnames_c1oct, function(col_name) {
    SISTRAT23_c1_2010_2024_df2 |>
      select(all_of(col_name)) |>
      distinct() |>
      pull()
  }),
  colnames_c1oct
)

unique_values_list_c1_duplicates24_df<- list_to_df(unique_values_list_c1_duplicates24)

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

invisible("problems with birth, adm or disch date")
#glimpse(slice(SISTRAT23_c1_2010_2022_df,problems_birth_date_c1$row))

invisible("replace missing birth date")

SISTRAT23_c1_2010_2024_df2 <- apply_date_corrections(
    df        = SISTRAT23_c1_2010_2024_df2,
    problems  = problems_birth_date_c1,
    target_col = "birth_date"     # change if your column is named differently
)

invisible("Rest of hashes without valid SENDA ID")
hashes_wrong_senda_id<- (SISTRAT23_c1_2010_2024_df2[problems_birth_date_c1$row,] |> tidytable::filter(grepl("000000", codigo_identificacion)) |> pull(hash_key))
invisible("2025-08-22: didnt find it in other databases")

SISTRAT23_c1_2010_2024_df2 <-
SISTRAT23_c1_2010_2024_df2|>
  tidytable::mutate(
    birth_date = tidytable::case_when(
      hash_key %in% hashes_wrong_senda_id & 
        between(edad, 17, 89) ~ adm_date - lubridate::years(edad),
      TRUE ~ birth_date
    )
  )

#Apply OBSERVATION where date changed
SISTRAT23_c1_2010_2024_df2$OBS <- ifelse(
  SISTRAT23_c1_2010_2024_df2$rn %in% problems_birth_date_c1$row[!is.na(SISTRAT23_c1_2010_2024_df2$birth_date[problems_birth_date_c1$row])],
  "0.a.Corrected birth date manually",
  ""
)

message(paste0("2025-09-27: ", 
               nrow( filter(SISTRAT23_c1_2010_2024_df2, 
                            is.na(birth_date)) ) , "\n"))

2025-09-27: 2

Code
# Tag rows flagged in problems_adm_date_c1 in the OBS column.
# Fix: the original assigned these OBS strings into the numeric `edad`
# (age) column through a mismatched logical subscript -- corrupting ages,
# producing the "replacement length" warning seen in the rendered output,
# and causing coercion NAs when edad was later converted to numeric.
SISTRAT23_c1_2010_2024_df2$OBS <- ifelse(
  SISTRAT23_c1_2010_2024_df2$rn %in% problems_adm_date_c1$row,
  "0.b.Replaced birth date manually (w/ SENDA adm date)",
  SISTRAT23_c1_2010_2024_df2$OBS
)

Warning in SISTRAT23_c1_2010_2024_df2\(edad[!is.na(SISTRAT23_c1_2010_2024_df2\)birth_date[problems_birth_date_c1\(row])] <- ifelse(SISTRAT23_c1_2010_2024_df2\)rn %in% : número de elementos para sustituir no es un múltiplo de la longitud del reemplazo

Code
# Show the rows that originally had problematic birth dates, post-correction.
SISTRAT23_c1_2010_2024_df2[problems_birth_date_c1$row,c("codigo_identificacion", "birth_date", "edad")] |> knitr::kable("markdown", caption= "Missing birth date")

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("replace missing admission date with SENDA admission date\n")
if (is.na(SISTRAT23_c1_2010_2024_df2$adm_date[problems_adm_date_c1$row]))  {
  SISTRAT23_c1_2010_2024_df2$adm_date[problems_adm_date_c1$row] <- SISTRAT23_c1_2010_2024_df2$senda_adm_date[problems_adm_date_c1$row]
}

SISTRAT23_c1_2010_2024_df2$OBS <- ifelse(
  SISTRAT23_c1_2010_2024_df2$rn %in% problems_adm_date_c1$row,
  "0.b.Replaced birth date manually (w/ SENDA adm date)",
  SISTRAT23_c1_2010_2024_df2$OBS
)

SISTRAT23_c1_2010_2024_df2$edad <- 
  as.numeric(SISTRAT23_c1_2010_2024_df2$edad)

Warning: NAs introducidos por coerción

Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
cat("try to replace missing discharge dates manually\n")
# Scratch checks used while curating dates:
# readr::parse_date(discharge_date, format="%d-%m-%Y")
# as.Date("2009-12-30")-as.Date("2009-06-30")

# Manually curated discharge-date corrections: one row per (hash_key,
# admission date) pair, derived from record-by-record review.
# NOTE(review): the previous case_when() listed several pairs more than once
# (rn 5760 three times; rn 10424/11482/12097/12102/12301/34193 repeating
# earlier pairs) -- those branches were unreachable and are collapsed into
# this single lookup table, which is also used for the OBS audit flag so the
# two lists can never drift apart.
# NOTE(review): the first correction implies a ~10.5-year stay
# (adm 2009-06-30 -> disch 2019-12-30).  The scratch note
# `as.Date("2009-12-30")-as.Date("2009-06-30")` suggests 2009-12-30 may have
# been intended -- TODO confirm against the source record before changing it.
disch_fix_tbl <- as.data.frame(
  matrix(c(
    # hash_key,                                                        admission,    corrected discharge
    "c4795829b6ea9cfc50b988c85deb391fa041d99a0ebca6b68a1378f37e3eb420", "2009-06-30", "2019-12-30",
    "23874d59570adaac6690c85481b869570c10c2f8931fc20636037cdff04af067", "2008-07-02", "2009-05-13",
    "5a16413f76625a09585c89fd3ea4fb05d1ea5cbfbc18247a9fb6e7e21534562d", "2008-07-23", "2009-04-14",
    "11b143acdce4bf1d3a72acd4a703ea8c38543fd02585b4f3b0433e227929ed3c", "2008-03-04", "2009-09-15",
    "986ded00e6ca834805a169ed528655e22f819bf5104d1729b2e1453f20f38065", "2008-12-05", "2009-06-02",
    "d402a1e13f25b2411ca346b0dc84b9fffa45887e628abf09262777b6deae85aa", "2009-06-09", "2009-06-09",
    "0d248b372c7224ae2cc1cabb750d6201150175b5d65ec0397ff2127d32b6b675", "2009-02-05", "2009-03-09",
    "6eb67e1ead556eb1dbd21951747440057a17a872b33b468a37c9bf781219cef8", "2009-10-07", "2010-04-10",
    "e0acff1477306ee93abfca7e251cc6d23db916b390a9fe506fbbefc371ce1d43", "2009-12-07", "2010-06-01",
    "eb13b44585501a35df9ce6d262ca6e69e4aa34063af219e19cc95e7609e38cdf", "2010-04-26", "2011-05-03",
    "058e8b2c02f98d488a78d78d80435e516c6628cd7edb87ecaf9f8c981d9614ba", "2010-05-03", "2010-10-04",
    "4d42363412d6a435dd2762bbee7f9b4fe4117ff4c94d55e10472342156238ccb", "2010-06-17", "2010-07-01",
    "228fc5b7b88c5f544f71f9ecfbad4d1750470b717f869a7aa9f01b0169a5d890", "2010-07-01", "2011-01-13",
    "7ebe4155bb7741beef0f30ce47ecbc735bd1f7137d22e81ba21d5f12f8398fa2", "2010-10-04", "2011-01-31",
    "93478aa27b121dbad91cb8e36ef60caa42fce6ca5b99478a77e9b8478df600f3", "2010-11-23", "2011-01-14",
    "6500209f17b52ab4e00a140f7c8f0a10d9b073f81ac9443203f0a1b84c4dc1e8", "2010-11-25", "2011-06-10",
    "4d6e97bfc2aeb15a8c6457ad1c84335de48b5456177b9749159ec2974537634f", "2010-11-25", "2011-06-20",
    "1d5a63a966cea8241228f0057a38ef4e63e0fb353dda174dc95d4393e4cdcefa", "2010-12-02", "2011-06-10",
    "c75bb8c43963dbad7a1b311497073a58b0e97bb82c5c63a4bc7ae4d1c9014592", "2011-01-13", "2011-07-10",
    "f40999d751e9eb84f5ed6d832d96a1de872599c181e28dd420507c58d7464ccf", "2011-02-08", "2011-08-04",
    "dbe7ddec7591332da15c3c4a1d4a2a1559d455a67b6c31a390ea546ea259c045", "2011-02-10", "2011-05-03",
    "05ff2bf96ef3a294c09b39cf91c19f7a74b080487f13f62c449812f14cefff37", "2011-03-22", "2011-07-31",
    "bdf81829448433489a21d8ac17de96f3765707798d8e2beb7653414f43f272aa", "2011-04-15", "2011-06-12",
    "0bd45263c5217ae4324c23ca4bfec945d4100276fcac4e3e66ad5b6f5341d3fd", "2011-05-20", "2011-06-01",
    "d6d0aaa21c50981871615a6b8886d1f69a3d0f125165f63f6a1c54729be5eea2", "2011-05-23", "2011-06-05",
    "4728851a593a1490d73682e45945fe0f253d0f18dfc12aa1d2d21deef206c39c", "2011-04-18", "2011-08-30",
    "caafb47faaab3c9637821a50ce4dcef33b8e3a9fc275f0ef76f0c93681eb15ba", "2011-06-06", "2011-07-04",
    "18096679bef8db59dbd0ca3be91fa36d7d9dcbbf06b85be2662f410d0146d1a2", "2011-06-17", "2011-07-31",
    "40d3ff594c6c3ddd96e37e5e53fbd22030916a99a4f04cf6283ad188058f2a5b", "2011-06-23", "2011-07-07",
    "667766680894eb203756044682c8445365bb0a831012ec49341b080390133d5d", "2011-06-20", "2011-08-02",
    "60e3066c438a10246353d3a3bce07a58fbfda39465aa84debd48cede21319a94", "2012-10-16", "2013-08-13",
    "08a5dc9a016c0525d7ceea954a8078391701ea9743b71bc2a012f0949952029f", "2013-01-07", "2013-07-17",
    "71049ebb5d958e0647c01c4398c91ff3e02275f7dc5e2fefee5bc263a7653c96", "2013-01-28", "2013-08-12",
    "52e218f6406835e8624ffe71595152560ec44a02a7580d673019eefa88df7a61", "2013-01-29", "2013-04-02",
    "22c282462adfb8e48b3a6b697d533244c9c656a6b31ff87d0180679d9f5ce98d", "2013-02-08", "2013-08-02",
    "221d71ae6c4dba4aee931b3ee518d47fd3972fed3fbf7f4d44c676bedca786c4", "2013-03-18", "2013-07-10",
    "877ea9b68dde038d9f63d04d4e65d1eb27ac3f46af22e310c7c2114feb7f871b", "2013-04-18", "2013-07-31",
    "14af0ddf318fb49877b16491b0fb7df491d98bd32dd854bdbec526f898dd9946", "2013-04-18", "2013-06-17",
    "243a1044f746ae87432532552b4b93b6978fb3b18fa3a4305a11b2af698eb013", "2013-04-16", "2013-07-27",
    "0e729e637c95d5d4486a7f822d14f0f1925ac358fff61d9bba9d7407b8e9abe7", "2013-04-29", "2013-07-25",
    "289a7b6c884980dc60c9171bb05939bacf18a62551ebda723af75cbfc8308db9", "2013-05-08", "2013-07-14",
    "cde086d548022a94e623bfc3d6b34202b28141ed2134ba35425ce4807e75f2fb", "2013-04-29", "2013-07-02",
    "10fc40384411161967b222bf530a0378e0ae585bd69370d57d9c4fb49a1a34c3", "2013-05-22", "2013-08-02",
    "67353760ae53ad8963176af0ec6cab9c4bdad13b9e53058e68e53f80b409b224", "2013-05-29", "2013-08-07",
    "3ce639d4d0330242d1f7c1e6496e834ad3fa2b41bef89b09bc373e9dede8c981", "2013-05-02", "2013-07-03",
    "5e6d9dcec9e717d4536f7cfa5cc0f713e7c2c7933058aeb9a37fec0a24da5151", "2013-06-06", "2013-07-31",
    "e01e3218ba73e9d26178e7a6aceb86357695bc88117f1d7b89c8adbf55210528", "2013-06-05", "2013-06-27",
    "421abbc2c85687aa87adec1c3146debf5ddea3ea71f65d708c2cf4d4dde86e38", "2013-07-02", "2013-07-08",
    "567f1fd735550a9bc1a2ea8a838d87b69369caa106c2d0cd0a1b38581d09919f", "2013-07-09", "2013-08-16",
    "7f259b5289b209cc669db813abfcd14519a21c4f69aaeb0190f094c61a52afad", "2013-06-28", "2013-07-09",
    "49cca05a51baac5c836a053eac96674c775e2d7164209a04f09f8da34952b789", "2013-07-02", "2013-08-02",
    "6adbbaff91e32138777abcf66a161d953722255c88368f9a5877d1ddfa48decd", "2013-08-06", "2013-08-20",
    "02c866ee44e5a3a310cf18728753e3a4c3751d4ea4d61edc22d78606cde0fcc8", "2013-08-01", "2013-08-16",
    "506be60207917af56fa39175f11ee5b3b874c0883245e37d0b2a79e0b24f08ad", "2013-08-01", "2013-08-22"
  ), ncol = 3, byrow = TRUE),
  stringsAsFactors = FALSE
)
names(disch_fix_tbl) <- c("hash_key", ".adm_chr", ".disch_fix")

SISTRAT23_c1_2010_2024_df_prev00 <-
  SISTRAT23_c1_2010_2024_df2 |>
  # character key so the join reproduces the original `adm_date == "YYYY-MM-DD"` tests
  tidytable::mutate(.adm_chr = as.character(adm_date)) |>
  tidytable::left_join(disch_fix_tbl, by = c("hash_key", ".adm_chr")) |>
  # the curated value wins; otherwise keep the raw discharge date (as character)
  tidytable::mutate(disch_date = tidytable::case_when(
    !is.na(.disch_fix) ~ .disch_fix,
    TRUE ~ as.character(discharge_date)
  )) |>
  # parse_date() attaches a `problems` attribute inspected right after this step
  tidytable::mutate(disch_date = readr::parse_date(disch_date, format = "%Y-%m-%d")) |>
  # audit trail: flag every record whose discharge date was manually modified
  tidytable::mutate(OBS = tidytable::case_when(
    !is.na(.disch_fix) ~ glue::glue("{OBS}; 0.c.Modify discharge date"),
    TRUE ~ OBS
  )) |>
  # flag records whose birth date was corrected using the integer age only
  tidytable::mutate(OBS = tidytable::case_when(
    hash_key %in% hashes_wrong_senda_id &
      between(edad, 17, 89) ~ glue::glue("{OBS}; 0.a*.Corrected birth date manually, but only using integer age"),
    TRUE ~ OBS
  )) |>
  # drop join helpers so downstream column bookkeeping (names_c1) is unchanged
  tidytable::select(-.adm_chr, -.disch_fix)
# Scratch notes from the manual review of problematic discharge dates:
#   40d3ff594c6c3ddd96e37e5e53fbd22030916a99a4f04cf6283ad188058f2a5b -> 2019-07-01
#   60e3066c438a10246353d3a3bce07a58fbfda39465aa84debd48cede21319a94 -> 2019-07-01
#   (fecha_egresode_tratamiento vs discharge_date)

invisible("store problematic parsing")
# readr::parse_date() left a `problems` attribute on the vector; capture it.
problems_disch_date <- readr::problems(SISTRAT23_c1_2010_2024_df_prev00$disch_date)

invisible("Eliminate column with previous transformations of discharge date")
SISTRAT23_c1_2010_2024_df_prev00$discharge_date <- NULL

# Surface any rows whose discharge date still failed to parse.
if (nrow(problems_disch_date) > 0) {
  message("There are still problems with the discharge date")
}

invisible("Eliminate spaces and ; sign if starts with one")
# Clean up the OBS audit trail when the first annotation was appended to an
# initially empty string (leading "; ").
SISTRAT23_c1_2010_2024_df_prev00$OBS <- sub("^;\\s*", "", SISTRAT23_c1_2010_2024_df_prev00$OBS)
HASH_KEY for hash_key
Added row number (rn) to identify deleted observations: 259,395 in total
check problems with values in each variable of the dataset
Missing birth date
codigo_identificacion birth_date edad
RONA12/2/1946 1946-02-02
LECA100000000
DAES219940513 1994-05-13
ALFI100000000 2025
replace missing admission date with SENDA admission date
try to replace missing discharge dates manually

The HASH key 60e3066c438a10246353d3a3bce07a58fbfda39465aa84debd48cede21319a94 has two observations with the same admission date but a missing fecha_egreso_de_tratamiento. However, the observation with a more recent retrieval date includes a discharge date. We kept this information as the valid one.


1. Drop duplicated entries

Many treatments span more than one year, meaning entries from different yearly datasets may correspond to the same treatment. We detected duplicate rows across nearly every variable, with the exception of the entry’s row number in the consolidated dataset and the dataset’s retrieval year.

Code
# Variable names used to detect exact duplicates (non-informative columns removed)
names_c1 <- setdiff(
  names(SISTRAT23_c1_2010_2024_df_prev00),
  c("codigo_identificacion", "TABLE", "TABLE_rec", "fecha_egreso_de_tratamiento",
    "motivo_de_egreso.x", "tipo_centro_derivacion.x", "rn")
)
#, "disch_date"

# Count how many times each fully identical row appears (data.table by-reference)
SISTRAT23_c1_2010_2024_df_prev1 <-
  tidytable::as_tidytable(SISTRAT23_c1_2010_2024_df_prev00)[
    , perfect_dup := .N, by = names_c1
  ]

# Frequency table of duplication multiplicities
SISTRAT23_c1_2010_2024_df_prev1 |>
  tidytable::arrange(hash_key, adm_date, desc(TABLE_rec)) |>
  tidytable::group_by(perfect_dup) |>
  tidytable::summarise(n = n()) |>
  tidytable::rename("Times in dataset" = perfect_dup, "Number of rows" = n) |>
  as.data.frame() |> # pander needs a plain data.frame
  pander::pander(style = 'rmarkdown', split.tables = Inf, caption = "Table 1. Duplicated cases in almost every variable (excluding TABLE, SENDA ID and row number)")
# 1                  1           104454
# 2                  2            98114
# 3                  3            16947
# 4                  4             2884
# 5                  5              470
# 6                  6              156
# 7                  7               28
# 8                  8                8

#- 2025-Aug
# | Times in dataset | Number of rows |
# |:----------------:|:--------------:|
# |        1         |    128,229     |
# |        2         |    110,676     |
# |        3         |     16,944     |
# |        4         |     2,884      |
# |        5         |      470       |
# |        6         |      156       |
# |        7         |       28       |
# |        8         |       8        |
Table 1. Duplicated cases in almost every variable (excluding TABLE, SENDA ID and row number)
Times in dataset Number of rows
1 186,587
2 66,488
3 5,676
4 544
5 100
Code
# Drop rows that are identical in every relevant variable, keeping the row
# from the most recently retrieved yearly dataset.
SISTRAT23_c1_2010_2024_df_prev1b <-
  tidytable::as_tidytable(SISTRAT23_c1_2010_2024_df_prev1) |>
  # order by retrieval date so distinct() keeps the most recent row
  tidytable::arrange(desc(TABLE_rec)) |>
  # annotate the cleaning step
  # NOTE(review): this overwrites any earlier OBS annotations ("" for
  # non-duplicates) -- confirm this is intended rather than appending.
  tidytable::mutate(OBS = case_when(perfect_dup > 1 ~ "1.1. Duplicated Cases in Almost Every Variable", TRUE ~ "")) |>
  # keep one row per unique combination of the relevant variables
  tidytable::distinct(all_of(names_c1), .keep_all = TRUE) |>
  # final ordering: patient, admission, retrieval dataset, discharge
  tidytable::arrange(hash_key, adm_date, desc(TABLE), desc(disch_date))

message(paste0("Deduplicated in almost every variable, Entries: ",
               nrow(SISTRAT23_c1_2010_2024_df_prev1b)))
message(paste0("Deduplicated in almost every variable, RUNs: ",
               SISTRAT23_c1_2010_2024_df_prev1b |> tidytable::distinct(hash_key) |> nrow()))

Deduplicated in almost every variable, Entries: 221879

Deduplicated in almost every variable, RUNs: 121464

Code
#2025-08-13
#Deduplicated in almost every variable, Entries: 190061
#Deduplicated in almost every variable, RUNs: 121464

Since these duplicated rows contain identical values, no additional information is lost by deleting them. Thus, we selected only 221,879 rows from the original 259,511 observations.

We still needed to identify which of the repeated treatments contained the most recent information for each specific treatment. Therefore, we removed duplicated rows but retained entries with the greater amount of days of treatment or from a more recent yearly dataset. However, handling cases with negative days of treatment presented a challenge. The table below displays HASH keys with entries showing negative days of treatment. In these cases, it was essential to clarify some dates to prevent overlap between treatments. As shown in this table, some entries can be replaced with events having similar dates, while others were marked as missing and will be imputed once the dataset is normalized.

Code
# Inputs:
# - SISTRAT23_c1_2010_2024_df_prev1b  (has columns: id, hash_key, and names_c1)
# - names_c1: character vector of the 103 columns to compare
# Coerce comparison columns to character once (so NA==NA can be treated as equal)

# columns to compare (exclude identifiers just in case)
# Columns to compare between candidate duplicate rows (exclude identifiers)
cols   <- setdiff(names_c1, c("hash_key", "rn"))
n_cols <- length(cols)

# keep only what we need & coerce to character once, so NA == NA can be
# treated as equal via explicit is.na() checks
df2 <- SISTRAT23_c1_2010_2024_df_prev1b|>
  tidytable::select(rn, hash_key, all_of(cols))|>
  tidytable::mutate(across(all_of(cols), as.character))|>
  tidytable::group_by(hash_key)|>
  tidytable::filter(n() > 1)|> # only hash_keys with more than one row
  ungroup()

# Pair rows within each hash_key, then compute % of matching columns.
# BUG FIX(review): the previous `pmap_int(..., ~ sum(map2_lgl(.x, .y, ...)))`
# compared only the FIRST pair of columns: inside pmap() the formula arguments
# are ..1, ..2, ... and `.x`/`.y` alias just ..1/..2, so `matches` could only
# be 0 or 1.  The explicit function below compares all `n_cols` column pairs.
similarities_tt <- df2 %>%
  inner_join(df2, by = "hash_key", suffix = c(".x", ".y")) %>%
  filter(rn.x < rn.y) %>%
  mutate(
    matches = pmap_int(
      select(., all_of(paste0(cols, ".x")), all_of(paste0(cols, ".y"))),
      function(...) {
        vals <- list(...)
        xs <- vals[seq_len(n_cols)]
        ys <- vals[n_cols + seq_len(n_cols)]
        sum(mapply(
          function(a, b) identical(a, b) || (is.na(a) && is.na(b)),
          xs, ys
        ))
      }
    ),
    percentage_similarity = matches / n_cols * 100
  ) %>%
  select(
    hash_key,
    id1 = rn.x,
    id2 = rn.y,
    matches,
    percentage_similarity
  ) %>%
  arrange(desc(percentage_similarity), desc(matches), id1, id2)

# Keep only highly similar pairs.
# BUG FIX(review): `percentage_similarity` is on a 0-100 scale, so the old
# threshold `.8` meant 0.8% and retained virtually every pair; 80 matches the
# intent of keeping pairs that agree on at least 80% of the compared columns.
similarities_tt <- similarities_tt|> tidytable::filter(percentage_similarity >= 80)


# Row numbers (rn) that participate in at least one highly similar pair
melt_ids_tt <- similarities_tt |>
  tidytable::pivot_longer(cols = c(id1, id2),
                          names_to = "which_id", values_to = "rn") |>
  tidytable::arrange(hash_key, which_id) |>
  tidytable::select(rn) |>
  tidytable::distinct()

# Pull the full records for those rows and attach each row's similarity score
similar_rows_db <-
  SISTRAT23_c1_2010_2024_df_prev1b |>
  tidytable::filter(rn %in% tidytable::pull(melt_ids_tt)) |>
  tidytable::left_join(
    tidytable::pivot_longer(similarities_tt, cols = c(id1, id2),
                            names_to = "which_id",
                            values_to = "rn")[, c("rn", "percentage_similarity")],
    by = "rn"
  )

1.b. Most frequent episodes by patients

Before analyzing these matches, we also need to ask ourselves if there is any HASH Key that might not be referring to a single patient. To do this, we examine a table showing the count of occurrences in the database.

Code
# Top-10 most frequent HASH keys; mask characters 9-10 before display
SISTRAT23_c1_2010_2024_df_prev1b |>
  janitor::tabyl(hash_key) |>
  arrange(desc(n)) |>
  head(10) |>
  data.frame() |>
  mutate(
    # anchored sub(): keep the first 8 characters, replace the next 2 with "**"
    hash_key = sub("^(.{8}).{2}", "\\1**", hash_key)
  ) |>
  tidytable::mutate(percent = scales::percent(percent, accuracy = 0.01)) |>
  knitr::kable('markdown', caption = "Table 2. Most frequent HASHs (masked 9th and 10th positions)")
# hash_key  n   percent
# 265141d6**1da3b6b3f24385df58fff31734290e1e156271cec46b48151dd9ab  17  0.01%
# 889f3c40**2d555db3ce0fec762972f1466c4fb772ef02e44ca579cd2dd6e249  15  0.01%
# 9550d492**e6f07553e11bb60f196e9d971081ceea9b200cb8d49bc9f3b8acf5  15  0.01%
# 96cc8140**189b3be9b3e1a35b3b18ebb58f2252b5e91b22a5347d098450f364  15  0.01%
# cac70802**74039c22857ef3b94355cc9aacef0af45d1bc79c5d1ad363a2fba3  15  0.01%
# dd290e90**7acadb698affd402df40d527389e7ec0fd0bee001b41d51d80d7e2  15  0.01%
# 06436b79**d835b3542ad3a09753bac01befad1d48a18047dc4358a6665b87b7  14  0.01%
# 363d31da**45038e8b644d1fe217d8dfff7dbb211d5009509212ddeaf3eb5875  14  0.01%
# 6a265e4b**1613507ab42ecac68f50f3d38dc4026db0c36571c0342785e9e243  14  0.01%
# 9de459eb**62c78daf7c8e853249e0ea65f05776872a789af2d0a26fd70d7330  14  0.01%
Table 2. Most frequent HASHs (masked 9th and 10th positions)
hash_key n percent
265141d6**1da3b6b3f24385df58fff31734290e1e156271cec46b48151dd9ab 20 0.01%
ea6485af**3da9c69ed341b5df6324b210ec3f7151d794c565d76576043473fc 19 0.01%
6a265e4b**1613507ab42ecac68f50f3d38dc4026db0c36571c0342785e9e243 17 0.01%
889f3c40**2d555db3ce0fec762972f1466c4fb772ef02e44ca579cd2dd6e249 17 0.01%
9550d492**e6f07553e11bb60f196e9d971081ceea9b200cb8d49bc9f3b8acf5 17 0.01%
96cc8140**189b3be9b3e1a35b3b18ebb58f2252b5e91b22a5347d098450f364 17 0.01%
9de459eb**62c78daf7c8e853249e0ea65f05776872a789af2d0a26fd70d7330 17 0.01%
dd290e90**7acadb698affd402df40d527389e7ec0fd0bee001b41d51d80d7e2 17 0.01%
23107ad6**5ba01ac654117348f322aedf84ed5ba43c2fe396c6be1b464fa8d0 16 0.01%
87c5e432**442ea0280e7b950408ced8f45c82f09c9148d6da285c3f1ad156b9 16 0.01%

Given the guidance from SENDA professionals, the duplicated rows in the table above actually correspond to different official IDs (RUNs). This indicates that distinct HASH values represent unique users, suggesting that the HASH is a more reliable identifier than SENDA’s ID. Therefore, we skipped the step of reconciling HASH keys with SENDA IDs.


1.c. Focus on Duplicated Cases and Dates of Admission

We needed to differentiate between duplicated cases and unique admissions from 2010 to 2024 to focus on treatment date ranges within each patient.

The first step consists of selecting columns for data completeness checks. We identified numeric and categorical columns, excluding specific columns created during the process (e.g., rn, perfect_dup, OBS and hash_key).

The second step consists of preparing the hash_adm_date dataset of duplicated entries with the same hash_key and adm_date. We also converted adm_date and disch_date to numeric values (adm_date_num and disch_date_num), and calculated dit as the difference between discharge and admission dates, representing the treatment duration.

In the third step, we identified entries sharing the same admission date for each patient, counted how many entries exist per combination of hash_key and adm_date_num (stored as ntot_hash_adm), and filtered for groups with more than one entry, as these indicate potential duplicates.

In the fourth step, we counted missing values in dit (days in treatment) within each group to track incomplete records. We also counted negative values in dit to flag data inconsistencies (e.g., discharge occurring before admission). Additionally, we added a sequential number for each row within the group (n_ord_hash_adm_date) to maintain the record order. Finally, we counted the unique treatment plans and programs in each group, storing these as ndis_tipo_plan and ndis_tipo_prog.

The fifth step involved calculating data completeness metrics for each entry by checking for missing values across the selected numeric and character columns, recorded in n_col_miss. Similarly, we calculated n_col_empty to count empty columns in character fields (e.g., empty strings). We then ranked entries to prioritize those with more complete data.

The final step involved filling missing dit_trans values with a default duration. This was based on a fixed reference date (“2025-05-28” in numeric form: unclass(as.Date("2025-05-28")) minus adm_date_num, ensuring all rows have consistent values for analysis. In previous years we had “2025-05-28” and “2019-11-13”.

Since some records showed inconsistencies in age, the average age (avg_age) within observations sharing the same hash and admission date was calculated to facilitate subsequent comparisons.

Remember that variables that contain .x at the end, come from databases with corrected information in discharge information (August 2025).

Code
# Numeric columns eligible for the completeness checks.
# FIX(review): vapply() instead of sapply() -- type-stable (always logical),
# identical result here, but no silent type change on degenerate input.
num_columns <- vapply(SISTRAT23_c1_2010_2024_df_prev1b, is.numeric, logical(1))
num_column_names <- names(SISTRAT23_c1_2010_2024_df_prev1b)[num_columns]
# drop bookkeeping columns that should not count toward completeness
num_column_names <- setdiff(num_column_names, c("rn", "perfect_dup"))
# Character columns eligible for the emptiness checks
char_columns <- vapply(SISTRAT23_c1_2010_2024_df_prev1b, is.character, logical(1))
char_column_names <- names(SISTRAT23_c1_2010_2024_df_prev1b)[char_columns]
# drop bookkeeping/identifier columns
char_column_names <- setdiff(char_column_names, c("OBS", "TABLE", "TABLE_rec", "hash_key", "codigo_identificacion"))

invisible("Get a database of observations with the same HASH and admission date")
# Build `hash_adm_date`: candidate within-patient duplicates, i.e. all entries
# sharing the same hash_key AND admission date, plus per-group diagnostics
# (entry counts, distinct plans/programmes, per-row completeness) used later
# to choose the best record in each group.
hash_adm_date <- 
 SISTRAT23_c1_2010_2024_df_prev1b|>
  # numeric date representations and days in treatment (dit = discharge - admission)
  tidytable::mutate(adm_date_num=as.numeric(as.Date(adm_date)),
    disch_date_num= as.numeric(as.Date(disch_date)), 
    dit= disch_date_num-adm_date_num)|>
  # dit_trans: open episodes (missing discharge) are censored at 2025-05-28
  # NOTE(review): presumably the retrieval/cutoff date of this update -- confirm
  tidytable::mutate(dit_trans= ifelse(is.na(disch_date), unclass(as.Date("2025-05-28"))- adm_date_num, disch_date_num- adm_date_num))|>
  #add a counter of entries per combination of hash_key and admission date
  tidytable::add_count(hash_key, adm_date_num, name = "ntot_hash_adm")|>
  # keep only groups with more than one entry (potential duplicates)
  tidytable::filter(ntot_hash_adm>1)|>
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(
    #number of distinct treatment plans and programmes within the group
    ndis_tipo_plan= tidytable::n_distinct(tipo_de_plan),
    ndis_tipo_prog= tidytable::n_distinct(tipo_de_programa),  
    #if there are different programs or tr. plans, 1, else, 0
    s1cab_plan_prog = tidytable::case_when(ndis_tipo_plan == 1 & ndis_tipo_prog == 1 ~ 0, ndis_tipo_plan > 1 | ndis_tipo_prog > 1 ~ 1, TRUE ~ 0))|>
  #filter days in treatment with missing values
  #tidytable::filter(is.na(dit)) |>
  tidytable::ungroup() |> 
  # per-row completeness: n_col_miss counts NAs across the selected numeric
  # columns plus all character columns; n_col_empty additionally counts
  # near-empty strings (fewer than 2 characters) in the character columns
  tidytable::mutate_rowwise(
    n_col_miss = sum(map_lgl(c_across(any_of(num_column_names)), ~ is.na(.x)),na.rm=T) + sum(map_lgl(c_across(where(is.character)), ~ is.na(.x)),na.rm=T),#,
    n_col_empty = sum(sapply(c_across(any_of(char_column_names)), function(x) sum(is.na(x) | nchar(x) < 2)), na.rm = TRUE))|>   
  # numeric birth date, used for later comparisons across candidate duplicates
  tidytable::mutate(birth_date_num= as.numeric(birth_date)) |> 
  # keep only the columns needed for the review/export step that follows
  tidytable::select(rn, TABLE_rec, hash_key, adm_date, adm_date_num, disch_date, tipo_de_plan, tipo_de_programa, id_centro, edad, motivo_de_egreso, ntot_hash_adm, dit, dit_trans, senda, s1cab_plan_prog, ndis_tipo_plan, ndis_tipo_prog, n_col_miss, n_col_empty, birth_date_num, edad_inicio_consumo, edad_inicio_sustancia_principal)
  
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
invisible("Export database to explore it")

# Rebuild the project root: append "cons" to getwd() then strip every "cons"
# occurrence, so the same path results whether the session starts in the
# project root or inside the cons/ folder.
# NOTE(review): the printed value ends in "/" (see output below), so the
# paste0(wdpath, "cons/...") below relies on that trailing slash — verify on
# other machines/drives.
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
wdpath
# Dump the duplicated-records table for manual exploration in Excel.
rio::export(file=paste0(wdpath,"cons/_out/db1c_hash_adm_date25.xlsx"),hash_adm_date)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
# Table 3: one row per hash_key x admission-date group, tabulated by admission
# year, plan/program-change flag (s1cab_plan_prog) and presence of missing
# discharge dates within the group.
hash_adm_date|>
  data.frame()|>
  tidytable::mutate(adm_year= clock::get_year(adm_date))|>
  tidytable::group_by(hash_key, adm_date)|>
  tidytable::mutate(
    count_miss_disch_date = sum(is.na(disch_date), na.rm=T))|>    
  # collapse to one row per group
  tidytable::slice(1)|>
  tidytable::ungroup()|>  
  tidytable::group_by(adm_year, ndis_tipo_plan, ndis_tipo_prog, count_miss_disch_date, s1cab_plan_prog)|>
  tidytable::summarise(Freq=n())|>
  tidytable::ungroup()|>  
  # dichotomize the missing-discharge count.
  # NOTE(review): ">1" classifies groups with exactly ONE missing discharge
  # date as 0 ("no miss") — confirm whether ">=1" was intended.
  tidytable::mutate(count_miss_disch_date = ifelse(as.numeric(count_miss_disch_date)>1, 1, 0))|>
  tidytable::group_by(adm_year, s1cab_plan_prog, count_miss_disch_date)|>
  tidytable::summarise(Freq= sum(Freq, na.rm=T))|>
  tidytable::ungroup()|>  
  # wide layout: one column per flag combination (plan_prog_{0,1} x miss_{0,1})
  tidyr::pivot_wider(
    names_from = c("s1cab_plan_prog", "count_miss_disch_date"),
    values_from = "Freq", 
    names_glue = "plan_prog_{s1cab_plan_prog}_miss_{count_miss_disch_date}",
    values_fill = 0)|>
  knitr::kable(style = 'markdown',
                 col.names= c("Admission year", "Same programs/plans, no miss disch date", "Distinct programs/plans, no miss disch date", "Same programs/plans, miss disch date", "Distinct programs/plans, miss disch date"),
                 caption = paste0("Table 3. Obs. w/ same HASH & admission date, by distinct program/plan (1) & missing values in discharge date(2) (total groups=", tidytable::group_by(hash_adm_date, hash_key, adm_date)|>  tidytable::slice(1)|>nrow(),")"))#8442

#    adm_year plan_prog_0_miss_0 plan_prog_1_miss_0 plan_prog_0_miss_1 plan_prog_1_miss_1
#       <int>              <int>              <int>              <int>              <int>
#  1     2007                  1                  0                  0                  0
#  2     2008                  3                  6                  0                  0
#  3     2009                 15                 13                  0                  0
#  4     2010                 32                 46                  0                  0
#  5     2011                 42                 55                  0                  0
#  6     2012                 32                 47                  0                  0
#  7     2013                 37                 25                  0                  0
#  8     2014                 47                 24                  2                  0
#  9     2015                 22                  8                 20                  0
# 10     2016                  9                  0                 29                  0
# 11     2017                 28                  1                176                  0
# 12     2018                409                  6                889                  9
# 13     2019               6258                 88                 63                  1
# 14     2020                105                  3                  6                  0
# 15     2021                735                 10                 18                  0
# 16     2022               5141                 83                 60                  6
# 17     2023                  1                  0                  0                  0
[1] "G:/My Drive/Alvacast/SISTRAT 2023//"
Table 3. Obs. w/ same HASH & admission date, by distinct program/plan (1) & missing values in discharge date(2) (total groups=42738)
Admission year Same programs/plans, no miss disch date Distinct programs/plans, no miss disch date Same programs/plans, miss disch date Distinct programs/plans, miss disch date
1 0 0 0
2000 1 0 0 0
2001 1 0 0 0
2007 15 0 0 0
2008 35 6 0 0
2009 186 13 0 0
2010 1330 46 3 0
2011 1472 55 2 0
2012 1632 47 0 0
2013 2587 25 1 0
2014 2455 24 4 0
2015 3041 8 41 0
2016 2805 0 39 0
2017 2787 1 195 0
2018 2484 6 880 9
2019 7136 87 64 2
2020 2481 3 75 0
2021 2881 8 206 2
2022 4717 83 63 6
2023 2541 0 146 0

As seen in the table above, admission years from 2023 onward had very few groups sharing the same HASH and admission date. For the remaining cases, we applied the following criterion to this group of observations, which we have termed “referral careers”: entries with the same admission date but with changes in plan or program and different discharge dates.

Code
invisible("plan_prog_1_miss_1 hay como 200 casos con fecha perdidas")

# Summary counts of duplicated HASH x admission-date records.
# In each message: n = rows; the first "p" = distinct hash_key x adm_date
# groups; the second "p" = distinct hash_key values (label reused on purpose
# to match earlier recorded outputs).
paste0("Duplicated HASH & adm date\n(n= ",formatC(nrow(hash_adm_date), big.mark=","),"\np= ",formatC(nrow(distinct(hash_adm_date,hash_key,adm_date)), big.mark=","),"\np= ",formatC(length(unique(hash_adm_date$hash_key)), big.mark=","),")")
#[1] "Duplicated HASH & adm date\n(n= 18,155\np= 8,442\np= 8,374)"
#2025 Agosto
#[1] "Duplicated HASH & adm date\n(n= 30,566\np= 14,611\np= 14,194)"


# Same counts, restricted to groups WITH a change of plan/program.
paste0("Duplicated HASH & adm date, changes in plan or program\n(n= ",formatC(nrow(tidytable::filter(hash_adm_date, s1cab_plan_prog==1)), big.mark=","),"\np= ",formatC(nrow(distinct(tidytable::filter(hash_adm_date, s1cab_plan_prog==1),hash_key,adm_date)), big.mark=","),"\np= ",formatC(length(unique(tidytable::filter(hash_adm_date, s1cab_plan_prog==1)$hash_key)), big.mark=","),")")
#[1] "Duplicated HASH & adm date, changes in plan or program\n(n= 698\np= 325\np= 325)"
#2025 Agosto
#[1] "Duplicated HASH & adm date, changes in plan or program\n(n= 916\np= 431\np= 430)"

# Same counts, restricted to groups WITHOUT a change of plan/program.
paste0("Duplicated HASH & adm date, no changes in plan or program\n(n= ",formatC(nrow(tidytable::filter(hash_adm_date, s1cab_plan_prog==0)), big.mark=","),"\np= ",formatC(nrow(distinct(tidytable::filter(hash_adm_date, s1cab_plan_prog==0),hash_key,adm_date)), big.mark=","),"\np= ",formatC(length(unique(tidytable::filter(hash_adm_date, s1cab_plan_prog==0)$hash_key)), big.mark=","),")")
#[1] "Duplicated HASH & adm date, no changes in plan or program\n(n= 17,457\np= 8,117\np= 8,055)"
#2025 Agosto
#[1] "Duplicated HASH & adm date, no changes in plan or program\n(n= 29,650\np= 14,180\np= 13,789)"
[1] "Duplicated HASH & adm date\n(n= 90,511\np= 42,738\np= 38,326)"
[1] "Duplicated HASH & adm date, changes in plan or program\n(n= 1,066\np= 431\np= 430)"
[1] "Duplicated HASH & adm date, no changes in plan or program\n(n= 89,445\np= 42,307\np= 37,990)"

The following flowchart represents a logical decision-making process for handling data entries with multiple observations that share the same unique identifiers, specifically “HASH” and “admission date.” The aim is to systematically evaluate each observation and retain only the most relevant one based on specific criteria.

Code
#https://bergant.github.io/bpmn/
# Rebuild the project root (strip any trailing "cons" component from getwd()).
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
wdpath
# Drive-dependent alternative root (Google Drive mount letter differs per
# machine). NOTE(review): envpath is computed but not used by the bpmn() call
# below — confirm whether it is still needed here.
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath

# Render the decision flowchart for duplicated HASH x admission-date records.
bpmn::bpmn(paste0(wdpath, "cons/_input/diagram_dup_hash_adm_date.bpmn"))
[1] "G:/My Drive/Alvacast/SISTRAT 2023//"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"

There were numerous internal referrals that did not involve changes to the treatment plan or program characteristics. Additionally, while we were uncertain about prioritizing more days in treatment over the completeness of information (non-missing columns) — criteria that can be further examined in explore_1ca2_lower_dit_but_more_data.R — we proceeded with this prioritization nonetheless.

Code
invisible("==================================================================")
invisible("Separate according to possible presence of referrals")
# Split duplicated HASH x admission-date records by the plan/program-change
# flag: s1cab_plan_prog == 0 -> candidate duplicates (same plan & program),
# s1cab_plan_prog == 1 -> possible "referral careers".
# FIX: the reference means (age, birth date, onset ages) are documented as
# being computed within observations sharing the same hash and admission date,
# but the data arrive here ungrouped (hash_adm_date ends with ungroup()), so
# the original mutate() took a single global mean; the computation is now
# grouped by hash_key x adm_date_num as intended.
hash_adm_date_1ca<-
hash_adm_date |>
  tidytable::filter(s1cab_plan_prog==0) |> 
  tidytable::group_by(hash_key, adm_date_num) |> 
    #there is variation in age records, so we kept a mean to keep a reference
  tidytable::mutate(avg_age= mean(edad, na.rm=T))|>
  tidytable::mutate(avg_birth_date_num= mean(birth_date_num, na.rm=T))|>
  tidytable::mutate(avg_onset_age= mean(edad_inicio_consumo, na.rm=T))|>
  tidytable::mutate(avg_primary_sub_onset_age= mean(edad_inicio_sustancia_principal, na.rm=T)) |> 
  tidytable::ungroup() |> 
  # group identifier (hash + admission date) used downstream for traceability
  tidytable::mutate(concat= paste0(hash_key, "_",adm_date))
hash_adm_date_1cb<-
hash_adm_date |>
  tidytable::filter(s1cab_plan_prog==1)|> 
  tidytable::group_by(hash_key, adm_date_num) |> 
    #there is variation in age records, so we kept a mean to keep a reference
  tidytable::mutate(avg_age= mean(edad, na.rm=T))|>
  tidytable::mutate(avg_birth_date_num= mean(birth_date_num, na.rm=T))|>
  tidytable::mutate(avg_onset_age= mean(edad_inicio_consumo, na.rm=T))|>
  tidytable::mutate(avg_primary_sub_onset_age= mean(edad_inicio_sustancia_principal, na.rm=T)) |> 
  tidytable::ungroup() |> 
  tidytable::mutate(concat= paste0(hash_key, "_",adm_date))
#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:
invisible("==================================================================")
invisible("If the process were truly sequential, shouldn't grouped variables be handled before discarding cases?; we defined objects sequentially")

# Step 1: within each hash_key x admission-date group, discard rows with a
# negative duration (discharge before admission) when the group also contains
# non-negative rows. Rows tagged "[*]" are the ones to remove; disc_1ca1
# records their row ids (rn), kept_1ca1 is the surviving complement.
disc_1ca1<-
hash_adm_date_1ca|>
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(count_neg_dit = sum(dit<0, na.rm=T)) |> 
  tidytable::ungroup() |> 
tidytable::mutate(
  DECISION1= tidytable::case_when(
  count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit<0~ "1c.a.1.cases w/neg days, removed neg. days[*]",
  count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit>=0~ "1c.a.1.cases w/neg days, removed neg. days", T~""))|>   
  # BUG FIX: the filter previously matched the label
  # "1c.a.1.cases w/neg days but w/ positive, removed neg. days[*]", a string
  # that the case_when above never produces, so disc_1ca1 was always empty.
  tidytable::filter(DECISION1 == "1c.a.1.cases w/neg days, removed neg. days[*]")|>
  pull(rn)
# kept_1ca1: rows surviving step 1 (negative-dit rows removed where a
# non-negative alternative exists). avg_age is (re)computed per group here.
kept_1ca1<-
hash_adm_date_1ca|>
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(avg_age= mean(edad, na.rm=T))|>
  tidytable::mutate(count_neg_dit = sum(dit<0, na.rm=T)) |> 
  tidytable::ungroup() |> 
tidytable::mutate(
  DECISION1= tidytable::case_when(
  count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit<0~ "1c.a.1.cases w/neg days, removed neg. days[*]",
  count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit>=0~ "1c.a.1.cases w/neg days, removed neg. days", T~""))|>
  tidytable::filter(!DECISION1 == "1c.a.1.cases w/neg days, removed neg. days[*]") 


# Progress report: number of groups still duplicated after step 1.
message(paste0("1.Groups that still have more than one entry: ",kept_1ca1 |>  tidytable::group_by(hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> nrow()))

1.Groups that still have more than one entry: 42305

Code
# Progress report: total rows belonging to still-duplicated groups after step 1.
message(paste0("1.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca1, hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> 
  tidytable::ungroup() |> 
  summarise(sum=sum(n))))

1.Cases that still have more than one entry: 89441

Code
#1.Groups that still have more than one entry: 8115
#1.Cases that still have more than one entry: 17453
#Aug 2025
#1.Groups that still have more than one entry: 14178
#1.Cases that still have more than one entry: 29646

# Step 2: in groups mixing missing and non-missing discharge dates, discard
# the rows whose discharge date is missing (keep the informative rows).
disc_1ca2 <-
kept_1ca1 |> 
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(count_miss_dit = sum(is.na(dit), na.rm=T)) |> 
  tidytable::ungroup() |> 
  tidytable::mutate(DECISION2= tidytable::case_when(
  count_miss_dit>0 & count_miss_dit<ntot_hash_adm & is.na(disch_date)~ "1c.a.2.cases w/missing discharge date, removed missed discharge dates[*]",
  count_miss_dit>0 & count_miss_dit<ntot_hash_adm & !is.na(disch_date)~ "1c.a.2.cases w/missing discharge date, removed missed discharge dates", T~"")) |>   
  tidytable::filter(DECISION2 == "1c.a.2.cases w/missing discharge date, removed missed discharge dates[*]") |>
  pull(rn)
# kept_1ca2: rows surviving step 2.
kept_1ca2 <-
kept_1ca1 |> 
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(count_miss_dit = sum(is.na(dit), na.rm=T)) |> 
  tidytable::ungroup() |> 
  tidytable::mutate(DECISION2= tidytable::case_when(
  count_miss_dit>0 & count_miss_dit<ntot_hash_adm & is.na(disch_date)~ "1c.a.2.cases w/missing discharge date, removed missed discharge dates[*]",
  count_miss_dit>0 & count_miss_dit<ntot_hash_adm & !is.na(disch_date)~ "1c.a.2.cases w/missing discharge date, removed missed discharge dates", T~"")) |>   
  tidytable::filter(!DECISION2 == "1c.a.2.cases w/missing discharge date, removed missed discharge dates[*]")


# Progress report: number of groups still duplicated after step 2.
message(paste0("2.Groups that still have more than one entry: ",kept_1ca2 |>  tidytable::group_by(hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> nrow()))

2.Groups that still have more than one entry: 34556

Code
# Progress report: total rows belonging to still-duplicated groups after step 2.
message(paste0("2.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca2, hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> 
  tidytable::ungroup() |> 
  summarise(sum=sum(n))))

2.Cases that still have more than one entry: 69437

Code
#2.Groups that still have more than one entry: 1079
#2.Cases that still have more than one entry: 2268
#2.Groups that still have more than one entry: 3522
#2.Cases that still have more than one entry: 7105

# Step 3: groups whose non-missing durations differ — keep the longest stay
# (min_rank(-dit) == 1) and discard the shorter ones.
disc_1ca3 <-
kept_1ca2 |> 
  #get cases w/more than one dit,  but not every row is missing
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(count_miss_dit2 = sum(is.na(dit), na.rm=T),
                    ndis_dit = tidytable::n_distinct(dit),
                    ntot_hash_adm2 = n(),
                    # rank 1 = longest days in treatment within the group
                    rank_by_dit = min_rank(-dit)) |> 
  tidytable::ungroup() |>   
  tidytable::mutate(DECISION3= tidytable::case_when(  
  ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit!=1~ "1c.a.3.cases w/different discharge dates, removed entries w/ lower dit[*]",
  ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit==1~ "1c.a.3.cases w/different discharge dates, removed entries w/ lower dit",
  T~""))|>   
  tidytable::filter(DECISION3 == "1c.a.3.cases w/different discharge dates, removed entries w/ lower dit[*]") |> 
  pull(rn)
# kept_1ca3: rows surviving step 3.
kept_1ca3 <-
kept_1ca2 |> 
  #get cases w/more than one dit,  but not every row is missing
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(count_miss_dit2 = sum(is.na(dit), na.rm=T),
                    ndis_dit = tidytable::n_distinct(dit),
                    ntot_hash_adm2 = n(),
                    rank_by_dit = min_rank(-dit)) |> 
  tidytable::ungroup() |>   
  tidytable::mutate(DECISION3= tidytable::case_when(  
  ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit!=1~ "1c.a.3.cases w/different discharge dates, removed entries w/ lower dit[*]",
  ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit==1~ "1c.a.3.cases w/different discharge dates, removed entries w/ lower dit",
  T~""))|>   
  tidytable::filter(!DECISION3 == "1c.a.3.cases w/different discharge dates, removed entries w/ lower dit[*]") 


# Progress report: number of groups still duplicated after step 3.
message(paste0("3.Groups that still have more than one entry: ",
               kept_1ca3 |>  tidytable::group_by(hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> nrow()))

3.Groups that still have more than one entry: 34365

Code
# Progress report: total rows belonging to still-duplicated groups after step 3.
message(paste0("3.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca3, hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> 
  tidytable::ungroup() |> 
  summarise(sum=sum(n))))

3.Cases that still have more than one entry: 68987

Code
#3.Groups that still have more than one entry: 849
#3.Cases that still have more than one entry: 1802
#3.Groups that still have more than one entry: 3291
#3.Cases that still have more than one entry: 6636

# Step 4: prefer the most complete rows — rank by number of missing columns
# (n_col_miss) and discard entries not ranked first.
# NOTE(review): rank_by_empty / ndis_empty_data are computed but DECISION4
# only discriminates on rank_by_missing; ties on emptiness are not broken
# here — confirm this is intended.
disc_1ca4 <-
kept_1ca3 |> 
  #filter days in treatment with missing values
  #tidytable::filter(is.na(dit)) |>
  #rank by missing and empty columns
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(
    rank_by_missing = min_rank(n_col_miss),
    rank_by_empty = min_rank(n_col_empty),
    ndis_miss_data = tidytable::n_distinct(n_col_miss),
    ndis_empty_data = tidytable::n_distinct(n_col_empty)) |> 
  tidytable::ungroup() |>   
  tidytable::mutate(DECISION4= tidytable::case_when(  
  (ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing!=1~ "1c.a.4.cases w/different amount of missing data, removed entries w/ more missingness[*]",
  (ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing==1~ "1c.a.4.cases w/different amount of missing data, removed entries w/ more missingness",
  T~""))|>   
  tidytable::filter(DECISION4 == "1c.a.4.cases w/different amount of missing data, removed entries w/ more missingness[*]") |>
  pull(rn)
# kept_1ca4: rows surviving step 4.
kept_1ca4 <-
kept_1ca3 |> 
  #filter days in treatment with missing values
  #tidytable::filter(is.na(dit)) |>
  #rank by missing and empty columns
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(
    rank_by_missing = min_rank(n_col_miss),
    rank_by_empty = min_rank(n_col_empty),
    ndis_miss_data = tidytable::n_distinct(n_col_miss),
    ndis_empty_data = tidytable::n_distinct(n_col_empty)) |> 
  tidytable::ungroup() |> 
  tidytable::mutate(DECISION4= tidytable::case_when(  
  (ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing!=1~ "1c.a.4.cases w/different amount of missing data, removed entries w/ more missingness[*]",
  (ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing==1~ "1c.a.4.cases w/different amount of missing data, removed entries w/ more missingness",
  T~""))|>   
  tidytable::filter(!DECISION4 == "1c.a.4.cases w/different amount of missing data, removed entries w/ more missingness[*]")


# Progress report: number of groups still duplicated after step 4.
message(paste0("4.Groups that still have more than one entry: ",kept_1ca4 |>  tidytable::group_by(hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> nrow()))

4.Groups that still have more than one entry: 1056

Code
# Progress report: total rows belonging to still-duplicated groups after step 4.
message(paste0("4.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca4, hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> 
  tidytable::ungroup() |> 
  summarise(sum=sum(n))))

4.Cases that still have more than one entry: 2122

Code
#4.Groups that still have more than one entry: 841
#4.Cases that still have more than one entry: 1785
#Aug 2025
#4.Groups that still have more than one entry: 3282
#4.Cases that still have more than one entry: 6618

# Step 5: when a group spans several database-retrieval years and the most
# recent retrieval has a non-missing discharge date, keep only the most
# recent retrieval. TABLE_rec is padded to 5 chars with a trailing "0" so
# parse_number()/10 yields a sortable numeric retrieval code (TABLE_rec2).
disc_1ca5 <-
kept_1ca4 |> 
  tidytable::mutate(TABLE_rec= ifelse(nchar(TABLE_rec) < 5,paste0(TABLE_rec, "0"), as.character(TABLE_rec))) |> 
  tidytable::mutate(TABLE_rec2= readr::parse_number(TABLE_rec)/10)|>
  # Sort by hash and admission date from most recent to oldest, with the retrieval year of the database 
  # in descending order (starting with the most recent) and discharge date in descending order (most recent in the first row).
  tidytable::arrange(hash_key, -adm_date_num, -TABLE_rec2, -disch_date)|>
  tidytable::group_by(hash_key, adm_date_num)|>
  # first_miss: top-sorted row of the group lacks a discharge date
  tidytable::mutate(first_miss = row_number() == 1 & is.na(disch_date))|>
  tidytable::mutate(
    cnt_first_miss = sum(first_miss==TRUE, na.rm=T),
    rank_retrieval_yr = min_rank(-TABLE_rec2),
    ndis_TABLE_rec = tidytable::n_distinct(TABLE_rec2))|>
  tidytable::ungroup() |> 
  tidytable::mutate(DECISION5= tidytable::case_when(  
  cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr!=1~ "1c.a.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]",
  cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr==1~ "1c.a.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs",
  T~""))|>   
  tidytable::filter(DECISION5 == "1c.a.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]")|> 
  pull(rn)
# kept_1ca5: rows surviving step 5 (note: TABLE_rec/TABLE_rec2 mutations are
# repeated here, so kept_1ca5 carries the padded/derived columns forward).
kept_1ca5 <-
kept_1ca4 |> 
  tidytable::mutate(TABLE_rec= ifelse(nchar(TABLE_rec) < 5,paste0(TABLE_rec, "0"), as.character(TABLE_rec)))|>
  tidytable::mutate(TABLE_rec2= readr::parse_number(TABLE_rec)/10)|>
  # Sort by hash and admission date from most recent to oldest, with the retrieval year of the database 
  # in descending order (starting with the most recent) and discharge date in descending order (most recent in the first row).
  tidytable::arrange(hash_key, -adm_date_num, -TABLE_rec2, -disch_date)|>
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(first_miss = row_number() == 1 & is.na(disch_date))|>
  tidytable::mutate(
    cnt_first_miss = sum(first_miss==TRUE, na.rm=T),
    rank_retrieval_yr = min_rank(-TABLE_rec2),
    ndis_TABLE_rec = tidytable::n_distinct(TABLE_rec2))|>
  tidytable::ungroup() |> 
  tidytable::mutate(DECISION5= tidytable::case_when(  
  cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr!=1~ "1c.a.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]",
  cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr==1~ "1c.a.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs",
  T~""))|>   
  tidytable::filter(!DECISION5 == "1c.a.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]")


  #more than one entry grouped, and where the amount of missing dits is different vs. the amount of rows.
  #should be 18 cases
# Sanity check: groups with >1 row where not every dit is missing.
kept_1ca5 |> #
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(count_miss_dit3 = sum(is.na(dit), na.rm=T),
                    ntot_hash_adm3 = n()) |> 
  tidytable::ungroup() |> 
  tidytable::filter(ntot_hash_adm3>1,ntot_hash_adm3!=count_miss_dit3) |> nrow()
#8
invisible("5b657cdf82ab1ca69e7e8f789bf8515d742731412354ea54ea67cf3d189173a4 este se debió haber ido antes, en el 4")
# Progress report: number of groups still duplicated after step 5.
message(paste0("5.Groups that still have more than one entry: ",kept_1ca5 |>  tidytable::group_by(hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> nrow()))

5.Groups that still have more than one entry: 95

Code
# Progress report: total rows belonging to still-duplicated groups after step 5.
message(paste0("5.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca5, hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> 
  tidytable::ungroup() |> 
  summarise(sum=sum(n))))

5.Cases that still have more than one entry: 200

Code
# 5.Groups that still have more than one entry: 218
# 5.Cases that still have more than one entry: 538
# Aug 2025
#5.Groups that still have more than one entry: 173
#5.Cases that still have more than one entry: 398

# Step 6: groups with a single retrieval year AND a single (possibly NA)
# discharge date — keep only the first row of the already-sorted group.
disc_1ca6 <-
kept_1ca5 |> 
  tidytable::group_by(hash_key, adm_date_num)|>
  #count amount missing days in treatment
  #count number of rows
  tidytable::mutate(count_miss_dit3 = sum(is.na(dit), na.rm=T),
                    ntot_hash_adm3 = n(),
                    ndis_TABLE_rec2 = tidytable::n_distinct(TABLE_rec2),
                    ndis_disch_date = tidytable::n_distinct(disch_date),
                    rn_hash_adm= row_number())|> 
  tidytable::ungroup() |> 
  #consider that the database is already ordered
  tidytable::mutate(DECISION6= tidytable::case_when(
  ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm!=1~ "1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]",
  ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm==1~ "1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs",T~""))|>   
  tidytable::filter(DECISION6 == "1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]")|> 
  pull(rn)
# kept_1ca6: rows surviving step 6.
kept_1ca6 <-
kept_1ca5 |> 
  tidytable::group_by(hash_key, adm_date_num)|>
  #count amount missing days in treatment
  #count number of rows
  tidytable::mutate(count_miss_dit3 = sum(is.na(dit), na.rm=T),
                    ntot_hash_adm3 = n(),
                    ndis_TABLE_rec2 = tidytable::n_distinct(TABLE_rec2),
                    ndis_disch_date = tidytable::n_distinct(disch_date),
                    rn_hash_adm= row_number())|> 
  tidytable::ungroup() |> 
  #consider that the database is already ordered
  tidytable::mutate(DECISION6= tidytable::case_when(
  ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm!=1~ "1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]",
  ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm==1~ "1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs",T~""))|>   
  tidytable::filter(!DECISION6 == "1c.a.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]")

# Progress report: number of groups still duplicated after step 6.
message(paste0("6.Groups that still have more than one entry: ",kept_1ca6 |>  tidytable::group_by(hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> nrow()))

6.Groups that still have more than one entry: 92

Code
# Progress report: total rows belonging to still-duplicated groups after step 6.
message(paste0("6.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca6, hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> 
  tidytable::ungroup() |> 
  summarise(sum=sum(n))))

6.Cases that still have more than one entry: 194

Code
# 6.Groups that still have more than one entry: 214
# 6.Cases that still have more than one entry: 530
#Aug 2025
# 6.Groups that still have more than one entry: 169
# 6.Cases that still have more than one entry: 390

# Step 7 (final): groups where EVERY remaining row has a missing discharge
# date — keep only the first row of the (already sorted) group, discard the
# rest.
disc_1ca7 <-
kept_1ca6 |> 
  tidytable::group_by(hash_key, adm_date_num)|>
  #count amount missing days in treatment
  #count number of rows
  tidytable::mutate(count_miss_dit4 = sum(is.na(dit), na.rm=T),
                    ntot_hash_adm4 = n(),
                    rn_hash_adm2= row_number())|> 
  tidytable::ungroup() |> 
  #consider that the database is already ordered
  tidytable::mutate(DECISION7= tidytable::case_when(
  count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2!=1~ "1c.a.7.cases w/ only missing disch. dates, get the last sorted entry[*]",
  # BUG FIX: this branch previously repeated rn_hash_adm2!=1 and was therefore
  # unreachable (case_when stops at the first match); the kept row is the
  # first sorted one (rn_hash_adm2==1).
  count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2==1~ "1c.a.7.cases w/ only missing disch. dates, get the last sorted entry",T~""))|>   
  tidytable::filter(DECISION7 == "1c.a.7.cases w/ only missing disch. dates, get the last sorted entry[*]")|> 
  pull(rn)
# kept_1ca7: rows surviving step 7 (at most one per group).
kept_1ca7 <-
kept_1ca6 |> 
  tidytable::group_by(hash_key, adm_date_num)|>
  #count amount missing days in treatment
  #count number of rows
  tidytable::mutate(count_miss_dit4 = sum(is.na(dit), na.rm=T),
                    ntot_hash_adm4 = n(),
                    rn_hash_adm2= row_number())|> 
  tidytable::ungroup() |> 
  #consider that the database is already ordered
  tidytable::mutate(DECISION7= tidytable::case_when(
  count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2!=1~ "1c.a.7.cases w/ only missing disch. dates, get the last sorted entry[*]",
  # BUG FIX: same unreachable-branch correction as in disc_1ca7 above.
  count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2==1~ "1c.a.7.cases w/ only missing disch. dates, get the last sorted entry",T~""))|>   
  tidytable::filter(!DECISION7 == "1c.a.7.cases w/ only missing disch. dates, get the last sorted entry[*]")

# Sanity check: after step 7 no hash_key x admission-date group should retain
# more than one entry.
message(paste0("7.Groups that still have more than one entry: ",kept_1ca7 |>  tidytable::group_by(hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> nrow()))

7.Groups that still have more than one entry: 0

Code
# Final check: total rows in still-duplicated groups after step 7 (expected 0).
message(paste0("7.Cases that still have more than one entry: ",
tidytable::group_by(kept_1ca7, hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> 
  tidytable::ungroup() |> 
  summarise(sum=sum(n))))

7.Cases that still have more than one entry: 0

Code
#7.Groups that still have more than one entry: 0
#7.Cases that still have more than one entry: 0
#7.Groups that still have more than one entry: 0
#7.Cases that still have more than one entry: 0

#8127
# Manual-review case notes: the invisible() strings below document individual
# duplicate cases inspected by hand (the IDs are hash_key values). They have
# no runtime effect; they only preserve the reviewers' reasoning in the script.
invisible("APPLY discarding")  
invisible("What about cases where all observations only have missing discharge dates?")  

invisible("Case with 1 row having more missing values and two with fewer. Keep the row with fewer missing values - ID: 71f48564995f6bfea9f7793c56c40a51d0acc2d80f8822cfcde787597cc0be9e")

invisible("4-row case - ID: 006683df777b093473d92a24bb23ed3516ec0515bf8d91f55d947bda86ec7388")

invisible("Year 2011 - ID: 00bdd3cf594f1df14230d8a31a960997bda563c7fd5f1633ff870fda4c92a59b")

invisible("ID: 00bdd3cf594f1df14230d8a31a960997bda563c7fd5f1633ff870fda4c92a59b - different discharge dates, same plan, program, and center; one entry shows abandonment, the other administrative discharge")
invisible("ID: 01cf3f2d6f609a036f84ac9f735e07b15b3332c424cffe6fe64380866a495838 - same admission date, different centers and discharge dates")
invisible("ID: 01ca660418b3a4434e86270394c58b6850b05867c665ec9332f204f6b6418812 - changes discharge date, but plan and center remain the same")
invisible("ID: 02c866ee44e5a3a310cf18728753e3a4c3751d4ea4d61edc22d78606cde0fcc8 - same admission date and database, everything identical except for discharge date")
invisible("ID: 00bdd3cf594f1df14230d8a31a960997bda563c7fd5f1633ff870fda4c92a59b - same center, plan, and program, but reason and number of days differ")
invisible("If identical database and all but discharge date match, retain the entry with the longest duration")
invisible("ID: cd5ab631d5fc6a47222360f74c30f2bcfea75a517def58a5eaf2cbd507edddcc - database 2012 has more days than 2011, with no changes in plan, program, or center")
[1] 6

From this part of the process, we removed 47138 entries from the database.

Initially, we aimed to apply a stricter selection of cases to retain records that represent a continuous sequence of treatments, including plan changes. However, this approach proved less meaningful, as we eventually intended to group them as a single treatment package. Therefore, we chose to follow the same selection criteria but have noted that these represent treatments involving a change of plan or program. This allows for later disaggregation; for example, if a given continuous treatment period exceeds 1,095 days—which indicates an extended duration that makes the treatment less plausible.

However, under certain conditions in step 3 (after filtering negative and missing dates) we evaluated whether, for each group with the same concat value (combination of HASH and admission date), the row with the flag ([*], fewer days in treatment) had fewer missing values (n_col_miss) and fewer empty values (n_col_empty) than the non-flagged row. Additionally, the row must come from TABLE_rec greater than 2012, and the SENDA column value should not be “No”. If these criteria were met, we kept the row.

After keeping the qualifying rows (discarding those that don’t meet the criteria), we generated a column named adm_date_rec. If the rows meeting the conditions were kept, we used the admission date from the row with the highest dit (non-flagged) that matches the disch_date of the row with the lowest dit; if not, we retained the original adm_date. Next, we generated a column called dit_rec, calculated as the difference in days between disch_date and adm_date_rec. This allowed us to construct differentiated continuous trajectories (treatment careers).

Code
# Step 1c.b.1 (discard side): in groups that mix negative and non-negative
# days in treatment (dit), collect the row numbers (rn) of the negative rows.
disc_1cb1<-
hash_adm_date_1cb|>
  tidytable::group_by(hash_key, adm_date_num)|>
  #there is variation in age records, so we kept a mean to keep a reference
  tidytable::mutate(avg_age= mean(edad, na.rm=T))|>
  tidytable::mutate(count_neg_dit = sum(dit<0, na.rm=T)) |> 
  tidytable::ungroup() |> 
tidytable::mutate(
  DECISION1= tidytable::case_when(
  count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit<0~ "1c.b.1.cases w/neg days, removed neg. days[*]",
  count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit>=0~ "1c.b.1.cases w/neg days, removed neg. days", T~""))|>   
  # BUG FIX: the filter previously compared against the label
  # "1c.b.1.cases w/neg days but w/ positive, removed neg. days[*]", which the
  # case_when above never produces, so disc_1cb1 was always empty. It now
  # matches the flagged label actually assigned (the same one kept_1cb1 negates).
  tidytable::filter(DECISION1 == "1c.b.1.cases w/neg days, removed neg. days[*]")|>
  pull(rn)
invisible("No cases")
# Step 1c.b.1 (keep side): retain the non-negative-dit rows of groups that mix
# negative and non-negative days in treatment, plus all rows of unaffected
# groups; also carries group-level reference columns used downstream.
kept_1cb1<-
hash_adm_date_1cb|>
  tidytable::group_by(hash_key, adm_date_num)|>
  # reference age per group (age records vary within a person)
  tidytable::mutate(avg_age= mean(edad, na.rm=T))|>
  # number of negative days-in-treatment entries per group
  tidytable::mutate(count_neg_dit = sum(dit<0, na.rm=T))|> 
  #add concatenated type of plan and program
  tidytable::mutate(tipode_plan_conc = paste(tipo_de_plan, collapse = ", "))|> 
  tidytable::mutate(tipode_prog_conc = paste(tipo_de_programa, collapse = ", "))|> 
  tidytable::ungroup()|> 
tidytable::mutate(
  DECISION1= tidytable::case_when(
  count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit<0~ "1c.b.1.cases w/neg days, removed neg. days[*]",
  count_neg_dit>0 & count_neg_dit<ntot_hash_adm & dit>=0~ "1c.b.1.cases w/neg days, removed neg. days", T~""))|>
  # drop the flagged ([*]) rows; everything else is kept
  tidytable::filter(!DECISION1 == "1c.b.1.cases w/neg days, removed neg. days[*]") 


# Diagnostic: number of (hash_key, adm_date_num) groups still holding >1 row.
message(paste0("1.Groups that still have more than one entry: ",kept_1cb1 |>  tidytable::group_by(hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> nrow()))

1.Groups that still have more than one entry: 431

Code
# Step 1c.b.1 diagnostic: total rows belonging to still-duplicated groups.
message(paste0(
  "1.Cases that still have more than one entry: ",
  kept_1cb1 |>
    tidytable::group_by(hash_key, adm_date_num) |>
    tidytable::summarise(n = n()) |>
    tidytable::filter(n > 1) |>
    tidytable::ungroup() |>
    summarise(sum = sum(n))
))

1.Cases that still have more than one entry: 1066

Code
#1.Groups that still have more than one entry: 431
#1.Cases that still have more than one entry: 916

# Step 1c.b.2 (discard side): within groups that mix missing and non-missing
# discharge dates, flag the rows whose disch_date is missing and collect their
# row numbers (rn).
disc_1cb2 <-
kept_1cb1 |> 
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(count_miss_dit = sum(is.na(dit), na.rm=T)) |> 
  tidytable::ungroup() |> 
  tidytable::mutate(DECISION2= tidytable::case_when(
  count_miss_dit>0 & count_miss_dit<ntot_hash_adm & is.na(disch_date)~ "1c.b.2.cases w/missing discharge date, removed missed discharge dates[*]",
  count_miss_dit>0 & count_miss_dit<ntot_hash_adm & !is.na(disch_date)~ "1c.b.2.cases w/missing discharge date, removed missed discharge dates", T~"")) |>   
  tidytable::filter(DECISION2 == "1c.b.2.cases w/missing discharge date, removed missed discharge dates[*]") |>
  pull(rn)
# Step 1c.b.2 (keep side): complement of disc_1cb2 — drop rows with a missing
# discharge date only when the group also has non-missing ones.
kept_1cb2 <-
kept_1cb1 |> 
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(count_miss_dit = sum(is.na(dit), na.rm=T)) |> 
  tidytable::ungroup() |> 
  tidytable::mutate(DECISION2= tidytable::case_when(
  count_miss_dit>0 & count_miss_dit<ntot_hash_adm & is.na(disch_date)~ "1c.b.2.cases w/missing discharge date, removed missed discharge dates[*]",
  count_miss_dit>0 & count_miss_dit<ntot_hash_adm & !is.na(disch_date)~ "1c.b.2.cases w/missing discharge date, removed missed discharge dates", T~"")) |>   
  tidytable::filter(!DECISION2 == "1c.b.2.cases w/missing discharge date, removed missed discharge dates[*]")

# Diagnostic: number of (hash_key, adm_date_num) groups still holding >1 row.
message(paste0("2.Groups that still have more than one entry: ",
               kept_1cb2 |>  tidytable::group_by(hash_key, adm_date_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> nrow()))

2.Groups that still have more than one entry: 330

Code
# Step 1c.b.2 diagnostic: total rows belonging to still-duplicated groups.
message(paste0(
  "2.Cases that still have more than one entry: ",
  kept_1cb2 |>
    tidytable::group_by(hash_key, adm_date_num) |>
    tidytable::summarise(n = n()) |>
    tidytable::filter(n > 1) |>
    tidytable::ungroup() |>
    summarise(sum = sum(n))
))

2.Cases that still have more than one entry: 767

Code
#2.Groups that still have more than one entry: 252
#2.Cases that still have more than one entry: 542

# Diagnostic: count entries in groups where EVERY row has a missing discharge
# date — steps 1c.b.1/1c.b.2 cannot resolve these, so they are reported here.
if(kept_1cb1 |> 
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(count_miss_dit = sum(is.na(dit), na.rm=T)) |> 
  tidytable::ungroup() |> 
  tidytable::filter(count_miss_dit==ntot_hash_adm)|> 
  nrow()>0){message(paste0("Cases with both observation with missing discharge dates:",
  kept_1cb1 |> 
    tidytable::group_by(hash_key, adm_date_num)|>
    tidytable::mutate(count_miss_dit = sum(is.na(dit), na.rm=T)) |> 
    tidytable::filter(count_miss_dit==ntot_hash_adm)|> 
    tidytable::ungroup() |> nrow()
))}

Cases with both observation with missing discharge dates:14

Code
invisible("From now on, the algorithm would change to keep entries")
# Step 1c.b.3 (discard side): in groups with more than one distinct dit (and
# not all missing), flag every row that does not have the maximum dit
# (min_rank(-dit) != 1) and collect its row number (rn).
disc_1cb3 <-
  kept_1cb2 |> 
  #get cases w/more than one dit,  but not every row is missing
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(count_miss_dit2 = sum(is.na(dit), na.rm=T),
                    ndis_dit = tidytable::n_distinct(dit),
                    ntot_hash_adm2 = n(),
                    rank_by_dit = min_rank(-dit)) |> 
  tidytable::ungroup() |>   
  tidytable::mutate(DECISION3= tidytable::case_when(  
    ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit!=1~ "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit[*]",
    ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit==1~ "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit",
    T~""))|>   
  tidytable::filter(DECISION3 == "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit[*]") |> 
  pull(rn)

# Step 1c.b.3.5: alternative keep rule described in the text above — a flagged
# ([*], lower-dit) row is retained only when it comes from a post-2012
# retrieval, is not SENDA "No", and has strictly fewer missing AND empty
# columns than the best non-flagged row of the same concat group. For the
# groups where a flagged row survives, the admission date is rebuilt
# (adm_date_rec) so consecutive spells chain into one trajectory, and dit_rec
# is recomputed from it.
kept_1cb3_5 <-
kept_1cb2 |> 
     #get cases w/more than one dit,  but not every row is missing
     tidytable::group_by(hash_key, adm_date_num)|>
     tidytable::mutate(count_miss_dit2 = sum(is.na(dit), na.rm=T),
                       ndis_dit = tidytable::n_distinct(dit),
                       ntot_hash_adm2 = n(),
                       rank_by_dit = min_rank(-dit))|> 
     tidytable::ungroup() |>   
     tidytable::mutate(DECISION3= tidytable::case_when(  
         ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit!=1~ "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit[*]",
         ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit==1~ "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit",
         T~""))|>   
     # normalize TABLE_rec so 4-character codes get a trailing "0", then derive
     # the numeric retrieval year (TABLE_rec2) by dividing the parsed number by 10
     tidytable::mutate(TABLE_rec= ifelse(nchar(TABLE_rec) < 5,paste0(TABLE_rec, "0"), as.character(TABLE_rec)))|>
     tidytable::mutate(TABLE_rec2= readr::parse_number(TABLE_rec)/10)|>
     tidytable::group_by(concat)|> 
     tidytable::filter( (grepl("\\[\\*\\]", DECISION3) & TABLE_rec2 > 2012 & senda != "No" & n_col_miss < max(n_col_miss[!grepl("\\[\\*\\]", DECISION3)], na.rm = TRUE) & n_col_empty < max(n_col_empty[!grepl("\\[\\*\\]", DECISION3)], na.rm = TRUE))| !grepl("\\[\\*\\]", DECISION3))|> 
  #add admission date
     tidytable::ungroup() |> 
     tidytable::group_by(concat) |> 
  #if there is a flagged case, there are no missing discharge dates and days in treatment are not the shorter, then replace the admission date with the discharge date of the shorter treatment and calculate the days in treatment
     tidytable::mutate(
      adm_date_rec = tidytable::case_when(
        any(grepl("\\[\\*\\]", DECISION3)) & !is.na(disch_date[which.min(dit)]) & dit!=dit[which.min(dit)] ~
          adm_date[which.max(dit)] + as.numeric(disch_date[which.min(dit)] - adm_date[which.max(dit)]),
        TRUE ~ adm_date)
    ) |> 
  tidytable::ungroup() |> 
  tidytable::mutate(dit_rec = as.numeric(disch_date - adm_date_rec), adm_date_rec_num= as.numeric(adm_date_rec)) |> 
     tidytable::ungroup() |> 
  #0 days in tr.
# [1] "52eeaf6394e67f4d8957ea733e1aeb6cf82c759a8e77f00eb24e685c95fc60d2"
# [2] "66415234677fd4f8454fe77532a56a5ac9dbdeadb6766539a1ce06fdd33b83f7"
     # drop flagged rows whose recomputed duration collapsed to exactly 0 days
     tidytable::filter(
         tidytable::case_when(grepl("\\[\\*\\]", DECISION3) & 
                                dit_rec==0~F,T~T)) 
#To check strange cases
 # kept_1cb3_5 |> #349
 #     tidytable::group_by(concat) |> 
 #     filter(any(grepl("\\[\\*\\]", DECISION3))) |> View()

# Diagnostic: groups (now keyed by the recomputed admission date) with >1 row.
message(paste0("3.5.Groups that still have more than one entry: ",
               kept_1cb3_5 |>  tidytable::group_by(hash_key, adm_date_rec_num)|>
                   tidytable::summarise(n= n())|> 
                   tidytable::filter(n>1)|> nrow()))

3.5.Groups that still have more than one entry: 146

Code
# Step 1c.b.3.5 diagnostic: total rows belonging to still-duplicated groups.
message(paste0(
  "3.5.Cases that still have more than one entry: ",
  kept_1cb3_5 |>
    tidytable::group_by(hash_key, adm_date_rec_num) |>
    tidytable::summarise(n = n()) |>
    tidytable::filter(n > 1) |>
    tidytable::ungroup() |>
    summarise(sum = sum(n))
))

3.5.Cases that still have more than one entry: 298

Code
# Groups that still have more than one entry: 5
# Cases that still have more than one entry: 10
#AUG 2025
#3.5.Groups that still have more than one entry: 29
#3.5.Cases that still have more than one entry: 58

#f4dbe23abde5d2bb5a48d69a9e3caebb470754a8b4185997022b307b08fa0b26, originally 3 cases, delete the lower dit, left 2

# Step 1c.b.3 (keep side, strict variant): drop all flagged lower-dit rows and
# carry adm_date_rec/dit_rec as plain copies of the original admission date so
# the schema matches kept_1cb3_5 before the two are merged.
kept_1cb3 <-
  kept_1cb2 |> 
  #get cases w/more than one dit,  but not every row is missing
  tidytable::group_by(hash_key, adm_date_num)|>
  tidytable::mutate(count_miss_dit2 = sum(is.na(dit), na.rm=T),
                    ndis_dit = tidytable::n_distinct(dit),
                    ntot_hash_adm2 = n(),
                    rank_by_dit = min_rank(-dit)) |> 
  tidytable::ungroup() |>   
  tidytable::mutate(DECISION3= tidytable::case_when(  
    ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit!=1~ "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit[*]",
    ndis_dit>1 & count_miss_dit2<ntot_hash_adm2 & rank_by_dit==1~ "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit",
    T~""))|>   
  tidytable::filter(!DECISION3 == "1c.b.3.cases w/different discharge dates, removed entries w/ lower dit[*]") |> 
    tidytable::mutate(adm_date_rec= adm_date) |> 
    tidytable::mutate(dit_rec = as.numeric(disch_date - adm_date_rec),
                      adm_date_rec_num= as.numeric(adm_date_rec))

# Diagnostic: number of (hash_key, adm_date_num) groups still holding >1 row.
message(paste0("3.Groups that still have more than one entry: ",
               kept_1cb3 |>  tidytable::group_by(hash_key, adm_date_num) |> 
                 tidytable::summarise(n= n()) |> 
                 tidytable::filter(n>1) |> nrow()))

3.Groups that still have more than one entry: 146

Code
# Step 1c.b.3 diagnostic: total rows belonging to still-duplicated groups.
message(paste0(
  "3.Cases that still have more than one entry: ",
  kept_1cb3 |>
    tidytable::group_by(hash_key, adm_date_num) |>
    tidytable::summarise(n = n()) |>
    tidytable::filter(n > 1) |>
    tidytable::ungroup() |>
    summarise(sum = sum(n))
))

3.Cases that still have more than one entry: 298

Code
#Groups that still have more than one entry: 5
#Cases that still have more than one entry: 10
#AUG 2025
#3.Groups that still have more than one entry: 29
#3.Cases that still have more than one entry: 58

# Merge the strict (kept_1cb3) and lenient (kept_1cb3_5) step-3 results: for
# each rn present in both, the kept_1cb3_5 version wins (desc(source) sorts it
# first, so distinct() keeps it).
kept_1cb3_3_5 <- bind_rows(
  kept_1cb3 %>% mutate(source = "kept_1cb3"),
  kept_1cb3_5 %>% mutate(source = "kept_1cb3_5")
) %>%
  #order by hash_key, original admission date and source
  tidytable::arrange(hash_key, adm_date, desc(source)) %>% # sort by source last, prioritizing kept_1cb3_5
  tidytable::distinct(rn, .keep_all = TRUE) %>% # keep the first unique rn, which comes from kept_1cb3_5 when duplicated
  tidytable::select(-source) # drop the helper column


# Diagnostic: groups (by recomputed admission date) still holding >1 row.
message(paste0("3 & 3.5.Groups that still have more than one entry: ",
               bind_rows(kept_1cb3_3_5) |>  tidytable::group_by(hash_key, adm_date_rec_num) |> 
                 tidytable::summarise(n= n()) |> 
                 tidytable::filter(n>1) |> nrow()))

3 & 3.5.Groups that still have more than one entry: 146

Code
# Step 3 & 3.5 diagnostic: total rows belonging to still-duplicated groups.
message(paste0(
  "3 & 3.5.Cases that still have more than one entry: ",
  bind_rows(kept_1cb3_3_5) |>
    tidytable::group_by(hash_key, adm_date_rec_num) |>
    tidytable::summarise(n = n()) |>
    tidytable::filter(n > 1) |>
    tidytable::ungroup() |>
    summarise(sum = sum(n))
))

3 & 3.5.Cases that still have more than one entry: 298

Code
#3 & 3.5.Groups that still have more than one entry: 22
#3 & 3.5.Cases that still have more than one entry: 44
#if cases with one entry are the same using adm_date_num and adm_date_num_rec, the replacement was badly done
#3 & 3.5.Groups that still have more than one entry: 5
#3 & 3.5.Cases that still have more than one entry: 10
#Aug 2025
#3 & 3.5.Groups that still have more than one entry: 29
#3 & 3.5.Cases that still have more than one entry: 58

# Step 1c.b.4 (discard side): when rows of the same group differ in
# missing/empty-column counts, flag the rows that do not have the fewest
# missing values and collect their row numbers (rn).
disc_1cb4 <-
  kept_1cb3_3_5 |> 
  #filter days in treatment with missing values
  #tidytable::filter(is.na(dit)) |>
  #rank by missing and empty columns
  tidytable::group_by(hash_key, adm_date_rec_num)|>
  tidytable::mutate(
    rank_by_missing = min_rank(n_col_miss),
    rank_by_empty = min_rank(n_col_empty),
    ndis_miss_data = tidytable::n_distinct(n_col_miss),
    ndis_empty_data = tidytable::n_distinct(n_col_empty)) |> 
  tidytable::ungroup() |>   
  tidytable::mutate(DECISION4= tidytable::case_when(  
    (ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing!=1~ "1c.b.4.cases w/different amount of missing data, removed entries w/ more missingness[*]",
    (ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing==1~ "1c.b.4.cases w/different amount of missing data, removed entries w/ more missingness",
    T~""))|>   
  tidytable::filter(DECISION4 == "1c.b.4.cases w/different amount of missing data, removed entries w/ more missingness[*]") |>
  pull(rn)
# Step 1c.b.4 (keep side): complement of disc_1cb4 — retain the least-missing
# row(s) of each group plus all rows of unaffected groups.
kept_1cb4 <-
  kept_1cb3_3_5 |> 
  #filter days in treatment with missing values
  #tidytable::filter(is.na(dit)) |>
  #rank by missing and empty columns
  tidytable::group_by(hash_key, adm_date_rec_num)|>
  tidytable::mutate(
    rank_by_missing = min_rank(n_col_miss),
    rank_by_empty = min_rank(n_col_empty),
    ndis_miss_data = tidytable::n_distinct(n_col_miss),
    ndis_empty_data = tidytable::n_distinct(n_col_empty)) |> 
  tidytable::ungroup() |> 
  tidytable::mutate(DECISION4= tidytable::case_when(  
    (ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing!=1~ "1c.b.4.cases w/different amount of missing data, removed entries w/ more missingness[*]",
    (ndis_miss_data>1 | ndis_empty_data>1) & rank_by_missing==1~ "1c.b.4.cases w/different amount of missing data, removed entries w/ more missingness",
    T~""))|>   
  tidytable::filter(!DECISION4 == "1c.b.4.cases w/different amount of missing data, removed entries w/ more missingness[*]")

# Diagnostic: groups (by recomputed admission date) still holding >1 row.
message(paste0("4.Groups that still have more than one entry: ",kept_1cb4 |>  tidytable::group_by(hash_key, adm_date_rec_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> nrow()))

4.Groups that still have more than one entry: 17

Code
# Step 1c.b.4 diagnostic: total rows belonging to still-duplicated groups.
message(paste0(
  "4.Cases that still have more than one entry: ",
  kept_1cb4 |>
    tidytable::group_by(hash_key, adm_date_rec_num) |>
    tidytable::summarise(n = n()) |>
    tidytable::filter(n > 1) |>
    tidytable::ungroup() |>
    summarise(sum = sum(n))
))

4.Cases that still have more than one entry: 34

Code
#4.Groups that still have more than one entry: 3
#4.Cases that still have more than one entry: 6
#AUG 2025
#4.Groups that still have more than one entry: 27
#4.Cases that still have more than one entry: 54

# Step 1c.b.5 (discard side): in groups spanning several retrieval years whose
# most recent sorted row has a non-missing discharge date, flag the rows from
# older retrieval years and collect their row numbers (rn).
disc_1cb5 <-
kept_1cb4 |> 
  # normalize 4-char TABLE_rec codes with a trailing "0", then convert to a numeric retrieval year
  tidytable::mutate(TABLE_rec= ifelse(nchar(TABLE_rec) < 5,paste0(TABLE_rec, "0"), as.character(TABLE_rec)), TABLE_rec= as.numeric(TABLE_rec)/10)|>
  # Sort by hash and admission date from most recent to oldest, with the retrieval year of the database 
  # in descending order (starting with the most recent) and discharge date in descending order (most recent in the first row).
  tidytable::arrange(hash_key, -adm_date_rec_num, -TABLE_rec, -disch_date)|>
  tidytable::group_by(hash_key, adm_date_rec_num)|>
  tidytable::mutate(first_miss = row_number() == 1 & is.na(disch_date))|>
  tidytable::mutate(
    cnt_first_miss = sum(first_miss==TRUE, na.rm=T),
    rank_retrieval_yr = min_rank(-TABLE_rec),
    ndis_TABLE_rec = tidytable::n_distinct(TABLE_rec))|>
  tidytable::ungroup() |> 
  tidytable::mutate(DECISION5= tidytable::case_when(  
  cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr!=1~ "1c.b.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]",
  cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr==1~ "1c.b.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs",
  T~""))|>   
  tidytable::filter(DECISION5 == "1c.b.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]")|> 
  pull(rn)
# Step 1c.b.5 (keep side): complement of disc_1cb5 — keep only the most recent
# retrieval year's rows in the affected groups.
kept_1cb5 <-
kept_1cb4 |> 
  tidytable::mutate(TABLE_rec= ifelse(nchar(TABLE_rec) < 5,paste0(TABLE_rec, "0"), as.character(TABLE_rec)), TABLE_rec= as.numeric(TABLE_rec)/10)|>
  # Sort by hash and admission date from most recent to oldest, with the retrieval year of the database 
  # in descending order (starting with the most recent) and discharge date in descending order (most recent in the first row).
  tidytable::arrange(hash_key, -adm_date_rec_num, -TABLE_rec, -disch_date)|>
  tidytable::group_by(hash_key, adm_date_rec_num)|>
  tidytable::mutate(first_miss = row_number() == 1 & is.na(disch_date))|>
  tidytable::mutate(
    cnt_first_miss = sum(first_miss==TRUE, na.rm=T),
    rank_retrieval_yr = min_rank(-TABLE_rec),
    ndis_TABLE_rec = tidytable::n_distinct(TABLE_rec))|>
  tidytable::ungroup() |> 
  tidytable::mutate(DECISION5= tidytable::case_when(  
  cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr!=1~ "1c.b.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]",
  cnt_first_miss==0 & ndis_TABLE_rec>1 & rank_retrieval_yr==1~ "1c.b.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs",
  T~""))|>   
  tidytable::filter(!DECISION5 == "1c.b.5.cases w/different retrieval yrs: earlier retrievals had no missing disch.dates, removed entries from previous retrieval yrs[*]")

# Diagnostic: groups (by recomputed admission date) still holding >1 row.
message(paste0("5.Groups that still have more than one entry: ",kept_1cb5 |>  tidytable::group_by(hash_key, adm_date_rec_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> nrow()))

5.Groups that still have more than one entry: 3

Code
# Step 1c.b.5 diagnostic: total rows belonging to still-duplicated groups.
message(paste0(
  "5.Cases that still have more than one entry: ",
  kept_1cb5 |>
    tidytable::group_by(hash_key, adm_date_rec_num) |>
    tidytable::summarise(n = n()) |>
    tidytable::filter(n > 1) |>
    tidytable::ungroup() |>
    summarise(sum = sum(n))
))

5.Cases that still have more than one entry: 6

Code
# Groups that still have more than one entry: 0
# Cases that still have more than one entry: 0
#AUG 2025
#5.Groups that still have more than one entry: 6
#5.Cases that still have more than one entry: 12

# Step 1c.b.6 (discard side): groups sharing a single retrieval year AND a
# single discharge date keep only their first (already-sorted) row; every
# later row is flagged and its rn collected.
disc_1cb6 <-
kept_1cb5 |> 
  tidytable::group_by(hash_key, adm_date_rec_num)|>
  #count amount missing days in treatment
  #count number of rows
  # NOTE(review): this counts missing `dit` while the kept_1cb6 counterpart
  # counts missing `dit_rec`; count_miss_dit3 is not used by DECISION6, so the
  # flags are unaffected, but the two pipelines should agree — confirm intent.
  tidytable::mutate(count_miss_dit3 = sum(is.na(dit), na.rm=T),
                    ntot_hash_adm3 = n(),
                    ndis_TABLE_rec2 = tidytable::n_distinct(TABLE_rec),
                    ndis_disch_date = tidytable::n_distinct(disch_date),
                    rn_hash_adm= row_number())|> 
  tidytable::ungroup() |> 
  #consider that the database is already ordered
  tidytable::mutate(DECISION6= tidytable::case_when(
  ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm!=1~ "1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]",
  ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm==1~ "1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs",T~""))|>   
  tidytable::filter(DECISION6 == "1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]")|> 
  pull(rn)
# Step 1c.b.6 (keep side): complement of disc_1cb6 — keep only the first
# sorted row of groups with a single retrieval year and discharge date.
kept_1cb6 <-
kept_1cb5 |> 
  tidytable::group_by(hash_key, adm_date_rec_num)|>
  #count amount missing days in treatment
  #count number of rows
  # count_miss_dit3 is computed from dit_rec here (cf. disc_1cb6, which uses
  # dit); it is not referenced by DECISION6 below.
  tidytable::mutate(count_miss_dit3 = sum(is.na(dit_rec), na.rm=T),
                    ntot_hash_adm3 = n(),
                    ndis_TABLE_rec2 = tidytable::n_distinct(TABLE_rec),
                    ndis_disch_date = tidytable::n_distinct(disch_date),
                    rn_hash_adm= row_number())|> 
  tidytable::ungroup() |> 
  #consider that the database is already ordered
  tidytable::mutate(DECISION6= tidytable::case_when(
  ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm!=1~ "1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]",
  ndis_TABLE_rec2==1 & ndis_disch_date==1 & rn_hash_adm==1~ "1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs",T~""))|>   
  tidytable::filter(!DECISION6 == "1c.b.6.cases w/ same retrieval yrs and disch. dates, removed entries from previous retrieval yrs[*]")

# Diagnostic: groups (by recomputed admission date) still holding >1 row.
message(paste0("6.Groups that still have more than one entry: ",kept_1cb6 |>  tidytable::group_by(hash_key, adm_date_rec_num) |> 
    tidytable::summarise(n= n()) |> 
    tidytable::filter(n>1) |> nrow()))

6.Groups that still have more than one entry: 3

Code
# Step 1c.b.6 diagnostic: total rows belonging to still-duplicated groups.
message(paste0(
  "6.Cases that still have more than one entry: ",
  kept_1cb6 |>
    tidytable::group_by(hash_key, adm_date_rec_num) |>
    tidytable::summarise(n = n()) |>
    tidytable::filter(n > 1) |>
    tidytable::ungroup() |>
    summarise(sum = sum(n))
))

6.Cases that still have more than one entry: 6

Code
# Groups that still have more than one entry: 0
# Cases that still have more than one entry: 0
# AUG 2025
#6.Groups that still have more than one entry: 6
#6.Cases that still have more than one entry: 12
# Step 1c.b.7 (discard side): for groups where EVERY row has a missing
# recomputed discharge duration, flag all rows except the first sorted one and
# collect their row numbers (rn).
disc_1cb7 <-
kept_1cb6 |> 
  tidytable::group_by(hash_key, adm_date_rec_num)|>
  #count amount missing days in treatment
  #count number of rows
  tidytable::mutate(count_miss_dit4 = sum(is.na(dit_rec), na.rm=T),
                    ntot_hash_adm4 = n(),
                    rn_hash_adm2= row_number())|> 
  tidytable::ungroup() |> 
  #consider that the database is already ordered
  # BUG FIX: the second branch previously repeated `rn_hash_adm2!=1` and was
  # unreachable; the non-flagged label belongs to the single kept row
  # (row number 1), mirroring step 1c.b.6.
  tidytable::mutate(DECISION7= tidytable::case_when(
  count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2!=1~ "1c.b.7.cases w/ only missing disch. dates, get the last sorted entry[*]",
  count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2==1~ "1c.b.7.cases w/ only missing disch. dates, get the last sorted entry",T~""))|>   
  tidytable::filter(DECISION7 == "1c.b.7.cases w/ only missing disch. dates, get the last sorted entry[*]")|> 
  pull(rn)
# Step 1c.b.7 (keep side): complement of disc_1cb7 — keep the first sorted row
# of all-missing-duration groups.
kept_1cb7 <-
kept_1cb6 |> 
  tidytable::group_by(hash_key, adm_date_rec_num)|>
  #count amount missing days in treatment
  #count number of rows
  # BUG FIX: count missing `dit_rec` (as disc_1cb7 does), not `dit`; the 1c.b
  # branch works on the recomputed duration, and a mismatch here lets the kept
  # and discarded sets disagree on which rows are flagged.
  tidytable::mutate(count_miss_dit4 = sum(is.na(dit_rec), na.rm=T),
                    ntot_hash_adm4 = n(),
                    rn_hash_adm2= row_number())|> 
  tidytable::ungroup() |> 
  #consider that the database is already ordered
  # BUG FIX: the second branch previously repeated `rn_hash_adm2!=1` and was
  # unreachable; the non-flagged label belongs to the kept first row.
  tidytable::mutate(DECISION7= tidytable::case_when(
  count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2!=1~ "1c.b.7.cases w/ only missing disch. dates, get the last sorted entry[*]",
  count_miss_dit4==ntot_hash_adm4 & rn_hash_adm2==1~ "1c.b.7.cases w/ only missing disch. dates, get the last sorted entry",T~""))|>   
  tidytable::filter(!DECISION7 == "1c.b.7.cases w/ only missing disch. dates, get the last sorted entry[*]")

# Step 1c.b.7 diagnostic: number of groups still holding more than one row.
message(paste0(
  "7.Groups that still have more than one entry: ",
  kept_1cb7 |>
    tidytable::group_by(hash_key, adm_date_rec_num) |>
    tidytable::summarise(n = n()) |>
    tidytable::filter(n > 1) |>
    nrow()
))

7.Groups that still have more than one entry: 0

Code
# Step 1c.b.7 diagnostic: total rows belonging to still-duplicated groups.
message(paste0(
  "7.Cases that still have more than one entry: ",
  kept_1cb7 |>
    tidytable::group_by(hash_key, adm_date_rec_num) |>
    tidytable::summarise(n = n()) |>
    tidytable::filter(n > 1) |>
    tidytable::ungroup() |>
    summarise(sum = sum(n))
))

7.Cases that still have more than one entry: 0

Code
#7.Groups that still have more than one entry: 0
#7.Cases that still have more than one entry: 0
#AUG 2025
#7.Groups that still have more than one entry: 0
#7.Cases that still have more than one entry: 0

From this part of the process, we removed 615 entries from the database.

Code
# Build compact annotation tables for the join below: keep rn plus the columns
# produced in each branch, and collapse DECISION1..7 into a single "; "
# separated observation string (obs_1ca / obs_1cb).
kept_1ca<-
kept_1ca7[,c("rn",paste0("DECISION",1:7),"avg_age","avg_birth_date_num","avg_onset_age","avg_primary_sub_onset_age")] %>%
  tidytable::mutate(
    obs_1ca = apply(.[, paste0("DECISION", 1:7)], 1, function(row) {
      glue::glue_collapse(row, sep = "::")
    })) |> 
           tidytable::select(-any_of(paste0("DECISION",1:7))) |>
  # normalize the "::" separators (runs of ":" become ";", leading ";" removed,
  # then each ";" gets a trailing space)
  tidytable::mutate(obs_1ca=gsub(":+", ";", obs_1ca)) |> 
  tidytable::mutate(obs_1ca=gsub("^;", "", obs_1ca)) |> 
  tidytable::mutate(obs_1ca=gsub(";", "; ", obs_1ca))

# Same construction for the 1c.b branch, which additionally carries the
# concatenated plan/program labels and the recomputed admission-date columns.
kept_1cb<-
kept_1cb7[,c("rn",paste0("DECISION",1:7), "tipode_plan_conc", "tipode_prog_conc", "avg_age", "avg_birth_date_num", "avg_onset_age", "avg_primary_sub_onset_age", "adm_date_rec", "dit_rec", "adm_date_rec_num")]%>%
  tidytable::mutate(
    obs_1cb = apply(.[, paste0("DECISION", 1:7)], 1, function(row) {
      glue::glue_collapse(row, sep = "::")
    })) |> 
      tidytable::select(-any_of(paste0("DECISION",1:7))) |> 
  tidytable::mutate(obs_1cb=gsub(":+", ";", obs_1cb)) |> 
  tidytable::mutate(obs_1cb=gsub("^;", "", obs_1cb)) |> 
  tidytable::mutate(obs_1cb=gsub(";", "; ", obs_1cb))

invisible("====================================================")
invisible("eliminate discarded entries")
# Rebuild the working dataset: drop rows discarded in branches 1c.a/1c.b,
# recompute numeric dates and days in treatment, join back the decision
# annotations, and fill the recovered columns for untouched rows.
SISTRAT23_c1_2010_2024_df_prev1c<-
 SISTRAT23_c1_2010_2024_df_prev1b|>
  tidytable::filter(!rn %in% setdiff(hash_adm_date_1ca$rn ,kept_1ca7$rn))|>   
  tidytable::filter(!rn %in% setdiff(hash_adm_date_1cb$rn ,kept_1cb7$rn)) |> 
  tidytable::mutate(adm_date_num=as.numeric(as.Date(adm_date)),
  disch_date_num= as.numeric(as.Date(disch_date)), 
  dit= disch_date_num-adm_date_num)|>
  #join rows and info. of the filtered rows
  tidylog::left_join(kept_1ca, by="rn") |> 
  tidylog::left_join(kept_1cb, by="rn") |> 
  #add info to the observation column
  tidytable::mutate(OBS = tidytable::case_when( !is.na(obs_1ca)~ glue("{OBS};{obs_1ca}"),T~OBS), OBS = tidytable::case_when(!is.na(obs_1cb)~ glue("{OBS};{obs_1cb}"), T~OBS)) |>
  #fill columns: prefer the 1c.a (.x) value, fall back to the 1c.b (.y) value,
  #and default the recovered date columns to the originals
  tidytable::mutate(
    avg_age = case_when(!is.na(avg_age.x) ~ avg_age.x, T ~ avg_age.y),
    avg_birth_date_num = case_when(!is.na(avg_birth_date_num.x) ~ avg_birth_date_num.x, T ~ avg_birth_date_num.y),
    avg_onset_age = case_when(!is.na(avg_onset_age.x) ~ avg_onset_age.x, T ~ avg_onset_age.y),
    avg_primary_sub_onset_age = case_when(!is.na(avg_primary_sub_onset_age.x) ~ avg_primary_sub_onset_age.x, T ~ avg_primary_sub_onset_age.y),
    adm_date_rec = case_when(!is.na(adm_date_rec)~adm_date_rec, T~adm_date),
    dit_rec = case_when(!is.na(dit_rec)~dit_rec, T~dit),
    adm_date_rec_num = case_when(!is.na(adm_date_rec_num)~adm_date_rec_num, T~adm_date_num)
  ) |> 
  #correct observation column: strip ALL leading semicolons in one pass
  #(replaces five chained gsub("^;", ...) calls that each removed only one)
   tidytable::mutate(OBS= gsub("^;+", "", OBS)) |> 
  #eliminate residual columns
   tidytable::select(-obs_1ca, -obs_1cb, -perfect_dup, -any_of( c("avg_age.x", "avg_birth_date_num.x", "avg_onset_age.x", "avg_primary_sub_onset_age.x", "tipode_plan_conc", "tipode_prog_conc", "avg_age.y", "avg_birth_date_num.y", "avg_onset_age.y", "avg_primary_sub_onset_age.y"))) |> 
  #this hash_key lacked admission-date information and is excluded
  tidytable::filter(!hash_key== "c46caa3cd2c89a2222ce319cf6f5e98392f928e0544ee5487ff6bedc1d3e76c2")

left_join: added 5 columns (avg_age, avg_birth_date_num, avg_onset_age, avg_primary_sub_onset_age, obs_1ca) > rows only in tidytable::mutate(tidyt.. 131,819 > rows only in kept_1ca ( 0) > matched rows 42,307 > ========= > rows total 174,126 left_join: added 14 columns (avg_age.x, avg_birth_date_num.x, avg_onset_age.x, avg_primary_sub_onset_age.x, tipode_plan_conc, …) > rows only in tidylog::left_join(tidy.. 173,675 > rows only in kept_1cb ( 0) > matched rows 451 > ========= > rows total 174,126

Code
# Sanity check: abort when any (hash_key, adm_date_rec_num) group still holds
# duplicated entries. BUG FIX: the condition previously used `nrow() > 1`,
# which silently tolerated exactly one duplicated group; any duplicate (> 0)
# must trigger the stop, matching the error message.
if(   
SISTRAT23_c1_2010_2024_df_prev1c |> 
    tidytable::group_by(hash_key, adm_date_rec_num) |> 
    tidytable::summarise(n=n()) |> 
    tidytable::filter(n>1) |> nrow()>0){stop("There are still duplicated entries with the same HASH and admission date")}

c46caa3cd2c89a2222ce319cf6f5e98392f928e0544ee******************* was excluded from the analysis because it did not have information on the admission date.


2. Age in Datasets

Age is a unit-invariant variable that can help standardize users and, in turn, differentiate treatments. Up to this point, we had only used fuzzy criteria to distinguish between different admissions. Therefore, a casuistic approach was necessary to identify duplicate treatments through probabilistic deduplication. These strategies are illustrated in the primary data preparation diagram. However, many cases have invalid ages (n = 284) or missing ages (n = 70,317). What is more intriguing were the inconsistent ages within the same HASH values (n = 22,988); however, this can be due to changes in the record date.

Differences between dates were calculated by converting them into numeric values, using the “Unix epoch” (1970-01-01) as the reference point. Hence, negative days may appear.

2.1.Rule-based solution to inconsistent dates of birth

Given the inaccuracies and inconsistencies regarding participants’ birth dates and ages, information from additional sources was used, particularly hospital records, mortality registries, Treatment Outcome Profile questionnaires, and SENDA Agreements 2 to 6, in cases where the patient had participated in one of these previously. This approach allows us to retain the most plausible dates and have greater certainty about participants’ ages when experiencing significant health events, based on available records.

Instead of looking on ages, highly dependent on retrieval date of the database, we looked at birth dates. To get this information, we extracted the last 8 characters from the codigo_identificacion (SENDA ID). 3,983 patients had inconsistent birth dates.

We successfully integrated the Prosecutor’s Office records (Base_fiscalia_v2) by linking an older version of the encrypted national ID (datasets sent in May 2023) with the current ID using 98 matching characteristics shared by both SENDA databases (process available in “import_c1_top_data_adm_25.qmd”). For validation, we relied on attributes that are theoretically invariant for patients: sex and birth date. We excluded records with missing birth dates (e.g., entries showing 1900-01-01). In cases of discrepancies within this dataset, we calculated the average birth date. As of August 2025, we added cases that were nondeterministically matched by all these variables, but with SENDA ID and admission date instead (stored in inconsistent_hashs_may23_PO_office_alt).

Code
# [1] "3,713" patients had inconsistent birth dates
# Aug 2025: "3,983" patients had inconsistent birth dates
#base::load(paste0(envpath,"data/20241015_out/","3_ndp_2024_11_08.Rdata"))
invisible("======================================================")

# Patients (hash_key) whose recorded age is not constant across their
# C1 entries. Note this is a weaker criterion than differing birth dates:
# the same birth date can yield different ages at different record dates.
hashs_inconsistent_ages <-
  SISTRAT23_c1_2010_2024_df_prev1c |>
  tidytable::group_by(hash_key) |>
  tidytable::summarise(n = n_distinct(edad)) |>
  tidytable::filter(n > 1) |>
  pull(hash_key)

message(paste0("HASHs with inconsistent ages: ", formatC(length(hashs_inconsistent_ages), big.mark=",")," (not the same as different birth date)\n"))

HASHs with inconsistent ages: 22,988 (not the same as different birth date)

Code
#AUG 2025
# HASHs with inconsistent ages: 14,506 (not the same as different birth date)

invisible("======================================================")
invisible("2024-11-09: many had different ages but same birthdates. Change the criteria")

# Count patients for whom EVERY entry has a missing birth date
# (na_birth == n means all of that patient's rows are NA).
message(paste0(
  "Patients with only missing values in birth date: ",
  SISTRAT23_c1_2010_2024_df_prev1c |>
    tidytable::group_by(hash_key) |>
    tidytable::summarise(n = n(), na_birth = sum(is.na(birth_date), na.rm = TRUE)) |>
    tidytable::ungroup() |>
    tidytable::filter(na_birth == n) |>
    nrow()
))

Patients with only missing values in birth date: 2

Code
#AUG 2025
#Patients with only missing values in birth date: 3

# Patients (hash_key) with more than one distinct birth date on record.
hashs_inconsistent_birth_dates<-
  SISTRAT23_c1_2010_2024_df_prev1c|> tidytable::group_by(hash_key)|> tidytable::summarise(n= tidytable::n_distinct(birth_date))|> tidytable::filter(n > 1)|> pull(hash_key)

# NOTE(review): this is byte-identical to `hashs_inconsistent_birth_dates`
# above, so "invalid" birth dates are never actually computed here — this
# looks like a copy-paste leftover. Confirm the intended invalidity criterion
# (e.g. age <16 or >90 at admission, or sentinel dates like 1900-01-01).
hashs_invalid_birth_dates<-
  SISTRAT23_c1_2010_2024_df_prev1c|> tidytable::group_by(hash_key)|> tidytable::summarise(n= tidytable::n_distinct(birth_date))|> tidytable::filter(n > 1)|> pull(hash_key)

# Fill missing birth dates within each patient (hash_key) using that
# patient's first non-missing birth date. The anonymous function in the
# middle only logs how many entries / patients are affected, then passes
# the data through unchanged.
SISTRAT23_c1_2010_2024_df_prev1c<-
SISTRAT23_c1_2010_2024_df_prev1c|>
    (\(df) {
        (message(paste0("Missing birth dates, Entries: ", nrow(df|> tidytable::filter(is.na(birth_date))))))
        (message(paste0("Missing birth dates, RUNs: ", tidytable::distinct(tidytable::filter(df,is.na(birth_date)), hash_key)|> nrow())))
        df
    })()|>
tidytable::group_by(hash_key)|>
tidytable::mutate(
    # coalesce: keep the existing value; otherwise take the patient's first
    # non-NA birth date (remains NA if the patient has none at all)
    birth_date = tidytable::coalesce(birth_date, birth_date[!is.na(birth_date)][1])
)|>
tidytable::ungroup() 

Missing birth dates, Entries: 2

Missing birth dates, RUNs: 2

Code
# Missing birth dates, Entries: 0
# Missing birth dates, RUNs: 0
# AUG 2025
# Missing birth dates, Entries: 2
# Missing birth dates, RUNs: 2

# Re-count patients whose every entry still lacks a birth date after the
# within-patient fill (patients with no non-NA date anywhere remain NA).
message(paste0(
  "Patients with only missing values in birth date, after replacement: ",
  SISTRAT23_c1_2010_2024_df_prev1c |>
    tidytable::group_by(hash_key) |>
    tidytable::summarise(n = n(), na_birth = sum(is.na(birth_date), na.rm = TRUE)) |>
    tidytable::ungroup() |>
    tidytable::filter(na_birth == n) |>
    nrow()
))

Patients with only missing values in birth date, after replacement: 2

Code
#AUG 2025
#Patients with only missing values in birth date, after replacement: 2


#invisible("No entries with missing birth dates")
invisible("I have three entries with missing birth dates")

invisible("======================================================")
# HOSP_filter_df: `edad_anos` is the patient's age in years at admission,
# `run` the patient identifier.

# One record (hash 00328deb...) has no admission date but does have a
# discharge date and a length of stay, so reconstruct the admission date as
# discharge date minus days of stay.
# BUG FIX: the original used clock::add_years(fecha_egreso, -dias_estad),
# which subtracted the length of stay in YEARS instead of DAYS.
HOSP_filter_df$fecha_ingreso <- ifelse(
  is.na(HOSP_filter_df$fecha_ingreso) &
    !is.na(HOSP_filter_df$fecha_egreso) &
    !is.na(HOSP_filter_df$dias_estad),
  HOSP_filter_df$fecha_egreso - HOSP_filter_df$dias_estad,  # Date minus days
  HOSP_filter_df$fecha_ingreso
)
# ifelse() strips the Date class; restore it explicitly (days since epoch).
HOSP_filter_df$fecha_ingreso <- as.Date(HOSP_filter_df$fecha_ingreso, origin = "1970-01-01")

# Derive birth date as admission date minus age in whole years.
# `invalid = "previous"` resolves impossible results, e.g. 1991-02-29 -> 1991-02-28.
# Computed once, AFTER the admission-date imputation above (the original also
# computed it before the imputation, a value that was immediately overwritten).
HOSP_filter_df$fecha_nac<- clock::add_years(HOSP_filter_df$fecha_ingreso,-HOSP_filter_df$edad_anos, invalid = "previous") #if invalid day,

# Hospital records for patients flagged with inconsistent birth dates:
# one row per patient (run) with their distinct derived birth dates spread
# into wide columns h_fechnac_1, h_fechnac_2, ... The lambda only logs
# entry/patient counts before passing the data through.
inconsistent_hashs_hosp<-  
HOSP_filter_df |> 
    tidytable::filter(run %in% hashs_inconsistent_birth_dates) |>
    (\(df) {
        (message(paste0("Hospital, Entries: ", nrow(df))))
        (message(paste0("Hospital, RUNs: ", tidytable::distinct(df, run) |> nrow())))
        df
    })() |>
    tidytable::distinct(run, fecha_nac) |>
    tidytable::group_by(run) |>
    # sequential id per distinct date within a patient, used as column suffix
    tidytable::mutate(id = as.character(dplyr::row_number())) |>
    tidytable::pivot_wider(names_from = id, values_from = fecha_nac, 
                           names_prefix = "h_fechnac_")

Hospital, Entries: 10467

Hospital, RUNs: 2770

Code
invisible("Since this measure depended on the admission day and month, we calculated the average birth date.")

# Per-patient mean of the distinct hospital-derived birth dates. Dates are
# averaged on the numeric scale (days since 1970-01-01) and converted back.
inconsistent_hashs_hosp_avg <-
  HOSP_filter_df |>
  tidytable::filter(run %in% hashs_inconsistent_birth_dates) |>
  tidytable::distinct(run, fecha_nac) |>
  tidytable::group_by(run) |>
  tidytable::summarise(
    h_avg_birth_date = as.Date(
      mean(as.numeric(fecha_nac), na.rm = TRUE),
      origin = "1970-01-01"
    )
  ) |>
  tidytable::ungroup()
#Hospital, Entries: 9957
#Hospital, RUNs: 2593
# Aug 2025
# Hospital, Entries: 10467
# Hospital, RUNs: 2770

invisible("======================================================")
#SISTRAT23_top_2015_2022_df$fecha_nacimiento
# TOP questionnaire birth dates for patients with inconsistent birth dates,
# pivoted wide: one row per HASH_KEY, columns t_fechnac_1, t_fechnac_2, ...
inconsistent_hashs_top<-  
SISTRAT23_top_2015_2024_df |>
    tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates) |>
    (\(df) {
        (message(paste0("TOP, Entries: ", nrow(df))))
        (message(paste0("TOP, RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow())))
        df
    })() |>  
    tidytable::distinct(HASH_KEY, birth_date) |>
    
    tidytable::ungroup() |>
    tidytable::group_by(HASH_KEY) |>
    # sequential id per distinct date within a patient, used as column suffix
    tidytable::mutate(id = as.character(dplyr::row_number())) |> 
    tidytable::pivot_wider(names_from = id, values_from = birth_date, 
                           names_prefix = "t_fechnac_")

TOP, Entries: 11895

TOP, RUNs: 2455

Code
# TOP, Entries: 10938
# TOP, RUNs: 2321
# Aug 2025
# TOP, Entries: 11895
# TOP, RUNs: 2455

invisible("======================================================")
#CONS_C2$fecha_nacimiento
# Agreement 2: parse birth date from the dd/mm/yyyy text column.
CONS_C2_25_df$birth_date<-readr::parse_date(CONS_C2_25_df$fecha_nacimiento,"%d/%m/%Y")

# C2 birth dates for patients with inconsistent birth dates, pivoted wide
# (columns c2_fechnac_1, c2_fechnac_2, ...).
inconsistent_hashs_c2<-  
CONS_C2_25_df|>
    tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates) |>
    (\(df) {
        (message(paste0("C2, Entries: ", nrow(df))))
        (message(paste0("C2, RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow())))
        df
    })() |>  
    tidytable::distinct(HASH_KEY, birth_date) |>
    tidytable::ungroup() |>
    tidytable::group_by(HASH_KEY) |>
    tidytable::mutate(id = as.character(dplyr::row_number())) |>  
    tidytable::pivot_wider(names_from = id, values_from = birth_date, 
                           names_prefix = "c2_fechnac_")

C2, Entries: 55

C2, RUNs: 28

Code
# C2, Entries: 37
# C2, RUNs: 21
# Aug 2025
# C2, Entries: 55
# C2, RUNs: 28

invisible("======================================================")
#CONS_C3$edad
# C3 has no explicit birth-date column; the last 8 characters of the SENDA ID
# (codigo_identificacion) encode the birth date as ddmmyyyy.
CONS_C3_25_df$birth_date<-
  stringr::str_sub(CONS_C3_25_df$codigo_identificacion, nchar(CONS_C3_25_df$codigo_identificacion)-7,nchar(CONS_C3_25_df$codigo_identificacion)) 
CONS_C3_25_df$birth_date<-  readr::parse_date(CONS_C3_25_df$birth_date, format="%d%m%Y") 

# C3 birth dates for patients with inconsistent birth dates, pivoted wide
# (columns c3_fechnac_1, c3_fechnac_2, ...).
inconsistent_hashs_c3<-  
CONS_C3_25_df|>
    tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates) |>
    (\(df) {
        (message(paste0("C3, Entries: ", nrow(df))))
        (message(paste0("C3, RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow())))
        df
    })() |>  
    tidytable::distinct(HASH_KEY, birth_date) |>
    tidytable::ungroup() |>
    tidytable::group_by(HASH_KEY) |>
    tidytable::mutate(id = as.character(dplyr::row_number())) |> 
    tidytable::pivot_wider(names_from = id, values_from = birth_date, 
                           names_prefix = "c3_fechnac_")

C3, Entries: 83

C3, RUNs: 54

Code
# C3, Entries: 64
# C3, RUNs: 50
# Aug 2025
# C3, Entries: 83
# C3, RUNs: 54

invisible("======================================================")
#CONS_C4$fechanacimiento
# Agreement 4: parse birth date from the dd/mm/yyyy text column, then pivot
# the distinct dates wide (columns c4_fechnac_1, ...) for flagged patients.
CONS_C4_25_df$birth_date <- readr::parse_date(CONS_C4_25_df$fechanacimiento,"%d/%m/%Y")
inconsistent_hashs_c4<-  
CONS_C4_25_df|>
    tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates) |>
    (\(df) {
        (message(paste0("C4, Entries: ", nrow(df))))
        (message(paste0("C4, RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow())))
        df
    })() |>  
    tidytable::distinct(HASH_KEY, birth_date) |>
    tidytable::ungroup() |>
    tidytable::group_by(HASH_KEY) |>
    tidytable::mutate(id = as.character(dplyr::row_number())) |>  
    tidytable::pivot_wider(names_from = id, values_from = birth_date, 
                           names_prefix = "c4_fechnac_")

C4, Entries: 43

C4, RUNs: 29

Code
# C4, Entries: 37
# C4, RUNs: 26
# Aug 2025
# C4, Entries: 43
# C4, RUNs: 29

invisible("======================================================")
#CONS_C5$fecha_nacimiento
# Agreement 5: parse birth date from the dd/mm/yyyy text column.
CONS_C5_25_df$birth_date<-readr::parse_date(CONS_C5_25_df$fecha_nacimiento,"%d/%m/%Y")

# C5 birth dates for patients with inconsistent birth dates, pivoted wide
# (columns c5_fechnac_1, ...).
inconsistent_hashs_c5<-  
CONS_C5_25_df|>
    tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates) |>
    (\(df) {
        (message(paste0("C5, Entries: ", nrow(df))))
        (message(paste0("C5, RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow())))
        df
    })() |>  
    tidytable::distinct(HASH_KEY, birth_date) |>
    tidytable::ungroup() |>
    tidytable::group_by(HASH_KEY) |>
    tidytable::mutate(id = as.character(dplyr::row_number())) |>  
    tidytable::pivot_wider(names_from = id, values_from = birth_date, 
                           names_prefix = "c5_fechnac_")

C5, Entries: 10

C5, RUNs: 9

Code
# C5, Entries: 7
# C5, RUNs: 6
# Aug 2025
# C5, Entries: 10
# C5, RUNs: 9

invisible("======================================================")
#CONS_C6$fechanacimiento
# Agreement 6: parse birth date from the dd/mm/yyyy text column.
CONS_C6_25_df$birth_date<-readr::parse_date(CONS_C6_25_df$fechanacimiento,"%d/%m/%Y")

# C6 birth dates for patients with inconsistent birth dates, pivoted wide
# (columns c6_fechnac_1, ...).
inconsistent_hashs_c6<-  
CONS_C6_25_df|>
    tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates) |>
    (\(df) {
        (message(paste0("C6, Entries: ", nrow(df))))
        (message(paste0("C6, RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow())))
        df
    })() |>  
    tidytable::distinct(HASH_KEY, birth_date) |>
    tidytable::ungroup() |>
    tidytable::group_by(HASH_KEY) |>
    tidytable::mutate(id = as.character(dplyr::row_number())) |>  
    tidytable::pivot_wider(names_from = id, values_from = birth_date, 
                           names_prefix = "c6_fechnac_")

C6, Entries: 19

C6, RUNs: 17

Code
# C6, Entries: 10
# C6, RUNs: 9
# Aug 2025
# C6, Entries: 19
# C6, RUNs: 17

invisible("======================================================")
# Mortality registry birth dates for flagged patients. Kept long (no pivot):
# as the rendered counts below show, each hashkey appears once here.
inconsistent_hashs_mortality<-  
mortality |>
    tidytable::filter(hashkey %in% hashs_inconsistent_birth_dates) |>
    (\(df) {
        (message(paste0("Mortality, Entries: ", nrow(df))))
        (message(paste0("Mortality, RUNs: ", tidytable::distinct(df, hashkey) |> nrow())))
        df
    })() |>  
    tidytable::distinct(hashkey, birth_date) |>
    tidytable::ungroup() |> 
    # rename to distinguish the source in the later wide join
    tidytable::rename("m_birthdate"="birth_date")

Mortality, Entries: 172

Mortality, RUNs: 172

Code
# Mortality, Entries: 171
# Mortality, RUNs: 171
# Aug 2025
# Mortality, Entries: 172
# Mortality, RUNs: 172

invisible("======================================================")
# Rebuild the working-directory path: append "/cons", then strip the literal
# "cons" substring, leaving getwd() plus a trailing "/".
# NOTE(review): gsub removes EVERY "cons" occurrence anywhere in the path,
# not only the appended suffix — verify paths never contain "cons" elsewhere.
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
wdpath
# Pick the environment path by the path's leading drive letter:
# "G" -> Google Drive mount, anything else -> the E: mirror.
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath

invisible("Construct PO Office database, clean names, discard missing birth dates and select sex and birth date, then get the mean of birth dates by HASH (in case of discrepancies")
# Read the Prosecutor's Office (PO) workbook. The path swaps "2023" for the
# "2019 (github)" folder; data starts after 4 header rows.
Base_fiscalia_v2 <- readxl::read_excel(paste0(sub("2023","2019 \\(github\\)",wdpath),"/Base_v3_dic_2021.xlsx"), 
sheet = "Base", skip = 4, guess_max = min(1000000, Inf))|>   janitor::clean_names()|>   
  # Harmonize the Metropolitan-region labels in both region columns.
  dplyr::mutate(gls_region=dplyr::case_when(
    gls_region=="REGION METROPOLITANA CENTRO NORTE"~ "RM Centro Norte",
    gls_region=="REGION METROPOLITANA OCCIDENTE"~ "RM Occidente",
    gls_region=="REGION METROPOLITANA ORIENTE"~ "RM Oriente",
    gls_region=="REGION METROPOLITANA SUR"~ "RM Sur",
    T~gls_region)) %>% 
  dplyr::mutate(region_delito=dplyr::case_when(
    region_delito=="REGION METROPOLITANA CENTRO NORTE"~ "RM Centro Norte",
    region_delito=="REGION METROPOLITANA OCCIDENTE"~ "RM Occidente",
    region_delito=="REGION METROPOLITANA ORIENTE"~ "RM Oriente",
    region_delito=="REGION METROPOLITANA SUR"~ "RM Sur",
    T~region_delito))|>
  # Keep only the date part (first 10 characters, yyyy-mm-dd) of each datetime.
  dplyr::mutate(fec_comision_simple=as.Date(stringr::str_extract(as.character(fec_comision), "^.{10}")))|>   
  dplyr::mutate(fec_cbiorelacion_simple=as.Date(stringr::str_extract(as.character(fec_cbiorelacion), "^.{10}")))|>  
  dplyr::mutate(fec_nacimiento_simple=as.Date(stringr::str_extract(as.character(fec_nacimiento), "^.{10}")))|>  
  dplyr::mutate(termino_relacion_simple=as.Date(stringr::str_extract(as.character(termino_relacion), "^.{10}")))|>   
  # Undo read_excel's "...NN" de-duplicated column names (see rendered "New
  # names" note above this chunk) back to the expected stable names.
  dplyr::rename("marca_suspension_43"="marca_suspension_46","marca_pena_44"="marca_pena_47","marca_multa_45"="marca_multa_48","medida_alternativa_46"="medida_alternativa_49","clasificacion_pena_47"="clasificacion_pena_50","tramos_condena_48"="tramos_condena","clasificacion_penarpa_1_49"="clasificacion_penarpa_1_52","clasificacion_penarpa_2_50"="clasificacion_penarpa_2_53","marca_suspension_51"="marca_suspension_54","marca_pena_52"="marca_pena_55","marca_multa_53"="marca_multa_56","medida_alternativa_54"="medida_alternativa_57","clasificacion_pena_55"="clasificacion_pena_58","tramos_condena_56"="tramos_condena_2","clasificacion_penarpa_1_57"="clasificacion_penarpa_1_60","clasificacion_penarpa_2_58"="clasificacion_penarpa_2_61")|>  
    # Age (in years) at crime commission and at end of the PO relationship,
    # computed as day differences / 365.25.
    dplyr::mutate(edad_comision=(unclass(fec_comision_simple)-unclass(fec_nacimiento_simple))/365.25,
                  edad_ter_rel=(unclass(termino_relacion_simple)-unclass(fec_nacimiento_simple))/365.25)

New names: • MARCA_SUSPENSION -> MARCA_SUSPENSION...46MARCA_PENA -> MARCA_PENA...47MARCA_MULTA -> MARCA_MULTA...48MEDIDA_ALTERNATIVA -> MEDIDA_ALTERNATIVA...49CLASIFICACION_PENA -> CLASIFICACION_PENA...50CLASIFICACION_PENARPA_1 -> CLASIFICACION_PENARPA_1...52CLASIFICACION_PENARPA_2 -> CLASIFICACION_PENARPA_2...53MARCA_SUSPENSION -> MARCA_SUSPENSION...54MARCA_PENA -> MARCA_PENA...55MARCA_MULTA -> MARCA_MULTA...56MEDIDA_ALTERNATIVA -> MEDIDA_ALTERNATIVA...57CLASIFICACION_PENA -> CLASIFICACION_PENA...58CLASIFICACION_PENARPA_1 -> CLASIFICACION_PENARPA_1...60CLASIFICACION_PENARPA_2 -> CLASIFICACION_PENARPA_2...61

Code
# Keep only the identifier, birth date and sex; drop the 1900-01-01 sentinel
# (the != comparison also drops rows whose birth date is NA), then compute the
# per-person mean birth date and the number of distinct birth dates.
Base_fiscalia_v2 <-
  Base_fiscalia_v2[, c("rut_enc_saf", "fec_nacimiento_simple", "sexo")] |>
  tidytable::filter(fec_nacimiento_simple != "1900-01-01") |>
  tidytable::group_by(rut_enc_saf) |>
  tidytable::mutate(
    avg_birth_date_po   = mean(fec_nacimiento_simple, na.rm = TRUE),
    n_dis_birth_date_po = n_distinct(fec_nacimiento_simple)
  ) |>
  tidytable::ungroup()


# Deterministic linkage: crosswalk old (HASH_KEY.y) -> new (HASH_KEY.x)
# encrypted IDs joined to the PO Office averages; `multiple="first"` keeps a
# single PO row per crosswalk entry. Only logged here (pipe commented out at
# the end), the distinct/ungroup tail lives in the commented lines below.
inconsistent_hashs_may23_PO_office<-  
OLD_NEW_SISTRAT23_c1_2010_2024_df2|>
  tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY.y"="rut_enc_saf"), multiple="first")|> 
  tidytable::select("HASH_KEY.x","HASH_KEY.y", "sexo.y","avg_birth_date_po")|> 
  tidytable::filter(HASH_KEY.x %in% hashs_inconsistent_birth_dates)|>
    (\(df) {
        (message(paste0("PO Office, Entries: ", nrow(df))))
        (message(paste0("PO Office, RUNs: ", tidytable::distinct(df, HASH_KEY.x) |> nrow())))
        df
    })() #|>  

right_join: added 5 columns (sexo.x, fec_nacimiento_simple, sexo.y, avg_birth_date_po, n_dis_birth_date_po)

        > rows only in OLD_NEW_SISTRAT23_c1_20.. ( 56,867)
        > rows only in Base_fiscalia_v2            30,256
        > matched rows                            607,945    (includes duplicates)
        >                                        =========
        > rows total                              638,201

PO Office, Entries: 13511

PO Office, RUNs: 3435

Code
    # tidytable::distinct(HASH_KEY.x, avg_birth_date_po) |>
    # tidytable::ungroup() 
#Office, Entries: 120446
#PO Office, RUNs: 3288
#AUG 2025
#PO Office, Entries: 13511
#PO Office, RUNs: 3435

# Non-deterministic linkage (added Aug 2025): patients matched via SENDA ID +
# admission date instead of the full attribute set. The PO-side encrypted ID
# is HASH_KEY_target; SENDA-side hash is HASH_KEY.
inconsistent_hashs_may23_PO_office_alt<-  
OLD_NEW_SISTRAT23_c1_2010_2024_df2_alt|>
  #discard overlappings in HASHs
  tidytable::filter(!HASH_KEY %in% OLD_NEW_SISTRAT23_c1_2010_2024_df2$HASH_KEY.x)|> 
  #join with PO Office
  tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY_target"="rut_enc_saf"), multiple="first")|> 
  #select variables of interest
  tidytable::select("HASH_KEY","HASH_KEY_target", "sexo.y","avg_birth_date_po")|> 
  #filter incosistent birth dates only
  tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates)|>
    (\(df) {
        (message(paste0("PO Office (alt., Aug 2025, not deterministically matched), Entries: ", nrow(df))))
        (message(paste0("PO Office (alt., Aug 2025, not deterministically matched), RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow())))
        df
    })() #|> 

right_join: added 5 columns (sexo.x, fec_nacimiento_simple, sexo.y, avg_birth_date_po, n_dis_birth_date_po) > rows only in tidytable::filter(OLD_N.. ( 1,858) > rows only in Base_fiscalia_v2 524,344 > matched rows 30,694 (includes duplicates) > ========= > rows total 555,038 PO Office (alt., Aug 2025, not deterministically matched), Entries: 87

PO Office (alt., Aug 2025, not deterministically matched), RUNs: 42

Code
# AUG 2025
#PO Office (alt., Aug 2025, non-deterministically matched), Entries: 87
#PO Office (alt., Aug 2025, non-deterministically matched), RUNs: 42
[1] "G:/My Drive/Alvacast/SISTRAT 2023//"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"

We joined the birth dates in wide format.

Code
# For every C1 entry of a patient with inconsistent birth dates, attach the
# external birth dates (hospital average, TOP, C2-C6, mortality) in wide
# format, and KEEP only rows with at least one external date available.
inconsistent_hashs_h_to_m<-
SISTRAT23_c1_2010_2024_df_prev1c|>
  tidytable::filter(hash_key %in% hashs_inconsistent_birth_dates)|>
  tidylog::left_join(inconsistent_hashs_hosp_avg, by=c("hash_key"="run"))|>
  tidytable::select(hash_key, birth_date, h_avg_birth_date)|>
  tidylog::left_join(inconsistent_hashs_top, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(inconsistent_hashs_c2, by=c("hash_key"="HASH_KEY"))|> 
  tidylog::left_join(inconsistent_hashs_c3, by=c("hash_key"="HASH_KEY"))|> 
  tidylog::left_join(inconsistent_hashs_c4, by=c("hash_key"="HASH_KEY"))|> 
  tidylog::left_join(inconsistent_hashs_c5, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(inconsistent_hashs_c6, by=c("hash_key"="HASH_KEY"))|> 
  tidylog::left_join(inconsistent_hashs_mortality, by=c("hash_key"="hashkey"))|>
  # tidylog::left_join(inconsistent_hashs_c1_2324, by=c("hash_key"="hash_key"))|>
  # tidylog::left_join(inconsistent_hashs_top_2324, by=c("hash_key"="hashkey"))|>
  # tidylog::left_join(inconsistent_hashs_c2_2224, by=c("hash_key"="hashkey"))|>
  tidytable::mutate_rowwise(
    # count external (non-C1) birth dates available on this row; the column
    # range spans all joined sources, hospital average through mortality
    non_NA_count = rowSums(!is.na(across(h_avg_birth_date:m_birthdate)))
  ) |> 
  dplyr::filter(non_NA_count>0)|> 
      (\(df) {
        (message(paste0("Inconsistent birth date that have at least one external birth date, Entries: ", nrow(df))))
        (message(paste0("Inconsistent birth date that have at least one external birth date, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
        df
    })() 

left_join: added one column (h_avg_birth_date) > rows only in tidytable::filter(SISTR.. 3,489 > rows only in inconsistent_hashs_hosp.. ( 0) > matched rows 8,663 > ======== > rows total 12,152 left_join: added 3 columns (t_fechnac_1, t_fechnac_2, t_fechnac_3) > rows only in tidytable::select(tidyl.. 3,885 > rows only in inconsistent_hashs_top ( 0) > matched rows 8,267 > ======== > rows total 12,152 left_join: added 3 columns (c2_fechnac_1, c2_fechnac_2, c2_fechnac_3) > rows only in tidylog::left_join(tidy.. 12,061 > rows only in inconsistent_hashs_c2 ( 0) > matched rows 91 > ======== > rows total 12,152 left_join: added 2 columns (c3_fechnac_1, c3_fechnac_2) > rows only in tidylog::left_join(tidy.. 11,946 > rows only in inconsistent_hashs_c3 ( 0) > matched rows 206 > ======== > rows total 12,152 left_join: added one column (c4_fechnac_1) > rows only in tidylog::left_join(tidy.. 12,061 > rows only in inconsistent_hashs_c4 ( 0) > matched rows 91 > ======== > rows total 12,152 left_join: added one column (c5_fechnac_1) > rows only in tidylog::left_join(tidy.. 12,128 > rows only in inconsistent_hashs_c5 ( 0) > matched rows 24 > ======== > rows total 12,152 left_join: added one column (c6_fechnac_1) > rows only in tidylog::left_join(tidy.. 12,091 > rows only in inconsistent_hashs_c6 ( 0) > matched rows 61 > ======== > rows total 12,152 left_join: added one column (m_birthdate) > rows only in tidylog::left_join(tidy.. 11,679 > rows only in inconsistent_hashs_mort.. ( 0) > matched rows 473 > ======== > rows total 12,152 Inconsistent birth date that have at least one external birth date, Entries: 11055

Inconsistent birth date that have at least one external birth date, RUNs: 3538

Code
#25/02/24
#nconsistent birth date that have at least one external birth date, Entries: 10146
#Inconsistent birth date that have at least one external birth date, RUNs: 3316

#AUG 2025
#Inconsistent birth date that have at least one external birth date, Entries: 11055
#Inconsistent birth date that have at least one external birth date, RUNs: 3538


#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
invisible("Export database to explore it")
# Recompute wdpath/envpath (same construction as above) and export the wide
# table of inconsistent birth dates for manual exploration.
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
rio::export(file=paste0(wdpath,"cons/_out/inconsistent_birthdates_25.csv"),inconsistent_hashs_h_to_m)
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"

Then we constructed the joined birth dates in long format.

Code
# Reshape the per-source birth dates to long format: one row per
# (patient, source, birth date), with implausible admission ages flagged and
# missing dates dropped.

# Helper: melt one wide birth-date source (columns <prefix>_fechnac_*) into
# hash_key / source_birth / birth_date. Hoisted so each source is melted
# once, instead of three times per bind_rows() as in the original inline
# calls (same output, less repeated work).
melt_birth_source <- function(wide_df, id_col) {
  long <- reshape2::melt(wide_df, id.vars = id_col)
  cbind.data.frame(
    hash_key     = long[[id_col]],
    source_birth = as.character(long$variable),
    birth_date   = long$value
  )
}

inconsistent_hashs_h_to_m_long <-
  inconsistent_hashs_h_to_m |>
  # number each original C1 birth date within a patient: orig_1, orig_2, ...
  tidytable::group_by(hash_key) |>
  tidytable::mutate(source_birth = dplyr::row_number()) |>
  tidytable::ungroup() |>
  tidytable::select(hash_key, source_birth, everything()) |>
  tidytable::mutate(source_birth = paste0("orig_", source_birth)) |>
  tidytable::select(hash_key, source_birth, birth_date) |>
  # hospital source: the per-patient average computed earlier
  tidytable::bind_rows(cbind.data.frame(
    hash_key     = inconsistent_hashs_hosp_avg$run,
    source_birth = rep("hosp_avg", times = nrow(inconsistent_hashs_hosp_avg)),
    birth_date   = inconsistent_hashs_hosp_avg$h_avg_birth_date
  )) |>
  # remaining wide sources all share the same layout
  tidytable::bind_rows(melt_birth_source(inconsistent_hashs_top, "HASH_KEY")) |>
  tidytable::bind_rows(melt_birth_source(inconsistent_hashs_c2, "HASH_KEY")) |>
  tidytable::bind_rows(melt_birth_source(inconsistent_hashs_c3, "HASH_KEY")) |>
  tidytable::bind_rows(melt_birth_source(inconsistent_hashs_c4, "HASH_KEY")) |>
  tidytable::bind_rows(melt_birth_source(inconsistent_hashs_c5, "HASH_KEY")) |>
  tidytable::bind_rows(melt_birth_source(inconsistent_hashs_c6, "HASH_KEY")) |>
  tidytable::bind_rows(melt_birth_source(inconsistent_hashs_mortality, "hashkey")) |>
  tidytable::arrange(hash_key, source_birth) |>
  tidytable::filter(!is.na(birth_date)) |>
  # attach the (first) recorded admission date per patient to judge plausibility
  tidytable::left_join(
    distinct(
      subset(SISTRAT23_c1_2010_2024_df_prev1c,
             select = c("hash_key", "adm_date_rec")),
      hash_key, .keep_all = TRUE
    ),
    by = "hash_key"
  ) |>
  tidytable::mutate(
    adm_age = lubridate::time_length(
      lubridate::interval(birth_date, adm_date_rec), unit = "year"
    )
  ) |>
  # flag implausible ages at admission; empty string otherwise
  tidytable::mutate(
    adm_age = tidytable::case_when(
      adm_age < 16 ~ "Less16",
      adm_age > 90 ~ "More90",
      TRUE ~ ""
    )
  )

# adm_date_rec was only needed to compute adm_age; drop it before export
inconsistent_hashs_h_to_m_long$adm_date_rec <- NULL
#inconsistent_hashs_c1_2324   inconsistent_hashs_top_2324   inconsistent_hashs_c2_2224
#inconsistent_hashs_c1_2324   inconsistent_hashs_top_2324   inconsistent_hashs_c2_2224

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
invisible("Export database to explore it")
# Recompute wdpath/envpath (same construction as above) and export the long
# table, plus a reproducible 100-patient sample for manual review.
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
rio::export(file=paste0(wdpath,"cons/_out/inconsistent_birthdates_long_25.csv"),inconsistent_hashs_h_to_m_long)
invisible("sample")
# sample_n_with_seed: presumably a project helper drawing a seeded sample of
# 100 hash_keys — TODO confirm where it is defined (not visible in this file).
inconsistent_hashs_h_to_m_long|>
    tidytable::filter(hash_key %in% dplyr::pull(sample_n_with_seed(data.frame(hashs_inconsistent_birth_dates),100, seed=2125),1)) |> 
    rio::export(file=paste0(wdpath,"cons/_out/inconsistent_birthdates_long_sample_25.csv"))
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"

For records with inconsistent birth dates across multiple data sources that lacked external information, we enriched these records with birth date information from the PO Office database. The analysis calculated the difference between the original and PO birth dates, and estimated the age at admission. The goal was to resolve birth date inconsistencies using an alternative authoritative source when external validation was unavailable.

Code
# Complement of inconsistent_hashs_h_to_m: same joins, but KEEP only rows
# with NO external birth date (non_NA_count == 0). These are later enriched
# with PO Office birth dates instead.
inconsistent_hashs_h_to_m_no_ext<-
SISTRAT23_c1_2010_2024_df_prev1c|>
  tidytable::filter(hash_key %in% hashs_inconsistent_birth_dates)|>
  tidylog::left_join(inconsistent_hashs_hosp_avg, by=c("hash_key"="run"))|>
  tidytable::select(hash_key, birth_date, h_avg_birth_date)|>
  tidylog::left_join(inconsistent_hashs_top, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(inconsistent_hashs_c2, by=c("hash_key"="HASH_KEY"))|> 
  tidylog::left_join(inconsistent_hashs_c3, by=c("hash_key"="HASH_KEY"))|> 
  tidylog::left_join(inconsistent_hashs_c4, by=c("hash_key"="HASH_KEY"))|> 
  tidylog::left_join(inconsistent_hashs_c5, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(inconsistent_hashs_c6, by=c("hash_key"="HASH_KEY"))|> 
  tidylog::left_join(inconsistent_hashs_mortality, by=c("hash_key"="hashkey"))|>
  tidytable::mutate_rowwise(
    # count external birth dates across all joined source columns
    non_NA_count = rowSums(!is.na(across(h_avg_birth_date:m_birthdate)))
  ) |> 
  dplyr::filter(non_NA_count==0)|> 
      (\(df) {
        (message(paste0("Inconsistent birth date that did not have external birth dates, Entries: ", nrow(df))))
        (message(paste0("Inconsistent birth date that did not have external birth dates, RUNs: ", tidytable::distinct(df, hash_key) |> nrow())))
        df
    })() 

left_join: added one column (h_avg_birth_date) > rows only in tidytable::filter(SISTR.. 3,489 > rows only in inconsistent_hashs_hosp.. ( 0) > matched rows 8,663 > ======== > rows total 12,152 left_join: added 3 columns (t_fechnac_1, t_fechnac_2, t_fechnac_3) > rows only in tidytable::select(tidyl.. 3,885 > rows only in inconsistent_hashs_top ( 0) > matched rows 8,267 > ======== > rows total 12,152 left_join: added 3 columns (c2_fechnac_1, c2_fechnac_2, c2_fechnac_3) > rows only in tidylog::left_join(tidy.. 12,061 > rows only in inconsistent_hashs_c2 ( 0) > matched rows 91 > ======== > rows total 12,152 left_join: added 2 columns (c3_fechnac_1, c3_fechnac_2) > rows only in tidylog::left_join(tidy.. 11,946 > rows only in inconsistent_hashs_c3 ( 0) > matched rows 206 > ======== > rows total 12,152 left_join: added one column (c4_fechnac_1) > rows only in tidylog::left_join(tidy.. 12,061 > rows only in inconsistent_hashs_c4 ( 0) > matched rows 91 > ======== > rows total 12,152 left_join: added one column (c5_fechnac_1) > rows only in tidylog::left_join(tidy.. 12,128 > rows only in inconsistent_hashs_c5 ( 0) > matched rows 24 > ======== > rows total 12,152 left_join: added one column (c6_fechnac_1) > rows only in tidylog::left_join(tidy.. 12,091 > rows only in inconsistent_hashs_c6 ( 0) > matched rows 61 > ======== > rows total 12,152 left_join: added one column (m_birthdate) > rows only in tidylog::left_join(tidy.. 11,679 > rows only in inconsistent_hashs_mort.. ( 0) > matched rows 473 > ======== > rows total 12,152 Inconsistent birth date that did not have external birth dates, Entries: 1097

Inconsistent birth date that did not have external birth dates, RUNs: 445

Code
#Inconsistent birth date that did not have external birth dates, Entries: 967
#Inconsistent birth date that did not have external birth dates, RUNs: 397
# Aug 2025
#Inconsistent birth date that did not have external birth dates, Entries: 1097
#Inconsistent birth date that did not have external birth dates, RUNs: 445


invisible("Get PO Office for inconsistent dates")
# One averaged PO Office birth date per SENDA hash (HASH_KEY.x) for flagged
# patients. Unlike inconsistent_hashs_may23_PO_office above, no
# `multiple="first"` here, so all PO matches flow in before distinct().
# NOTE(review): "brith" in the object name is a typo, but it is referenced
# downstream (joins in inconsistent_hashs_h_to_m_no_ext_long) — renaming
# would have to be done in all places at once.
PO_brith_dates_for_inconsistent_dates  <-  
OLD_NEW_SISTRAT23_c1_2010_2024_df2|>
  tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY.y"="rut_enc_saf"))|> 
  tidytable::select("HASH_KEY.x","HASH_KEY.y", "sexo.y","avg_birth_date_po")|> 
  tidytable::filter(HASH_KEY.x %in% hashs_inconsistent_birth_dates)|>
    (\(df) {
        (message(paste0("PO Office, Entries: ", nrow(df))))
        (message(paste0("PO Office, RUNs: ", tidytable::distinct(df, HASH_KEY.x)|> nrow())))
        df
    })()|>  
    tidytable::distinct(HASH_KEY.x, avg_birth_date_po)|>
    tidytable::ungroup()

right_join: added 5 columns (sexo.x, fec_nacimiento_simple, sexo.y, avg_birth_date_po, n_dis_birth_date_po) > rows only in OLD_NEW_SISTRAT23_c1_20.. ( 56,867) > rows only in Base_fiscalia_v2 30,256 > matched rows 1,164,249 (includes duplicates) > =========== > rows total 1,194,505 PO Office, Entries: 119831

PO Office, RUNs: 3435

Code
#PO Office, Entries: 120446
#PO Office, RUNs: 3288
#Aug 2025
#PO Office, Entries: 119831
#PO Office, RUNs: 3435

# Alternative (non-deterministic, Aug 2025) PO Office linkage for flagged
# patients: one averaged PO birth date per SENDA hash.
# BUG FIX: the join key was `HASH_KEY` (the SENDA-side hash), but in this
# `_alt` crosswalk the PO Office encrypted ID lives in `HASH_KEY_target` —
# joining on `HASH_KEY` matched 0 rows (see the rendered output). Join on
# `HASH_KEY_target`, mirroring inconsistent_hashs_may23_PO_office_alt above.
PO_brith_dates_for_inconsistent_dates_alt  <-  
OLD_NEW_SISTRAT23_c1_2010_2024_df2_alt|>
  tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY_target"="rut_enc_saf"))|> 
  tidytable::select("HASH_KEY","HASH_KEY_target", "sexo.y","avg_birth_date_po")|> 
  tidytable::filter(HASH_KEY %in% hashs_inconsistent_birth_dates)|>
    (\(df) {
        (message(paste0("PO Office (alt., Aug 2025, not deterministically matched), Entries: ", nrow(df))))
        (message(paste0("PO Office (alt., Aug 2025, not deterministically matched), RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    # one row per SENDA hash with the PO-averaged birth date
    tidytable::distinct(HASH_KEY, avg_birth_date_po)|>
    tidytable::ungroup()

right_join: added 5 columns (sexo.x, fec_nacimiento_simple, sexo.y, avg_birth_date_po, n_dis_birth_date_po) > rows only in OLD_NEW_SISTRAT23_c1_20.. ( 5,545) > rows only in Base_fiscalia_v2 554,407 > matched rows 0 > ========= > rows total 554,407 PO Office (alt., Aug 2025, not deterministically matched), Entries: 0

PO Office (alt., Aug 2025, not deterministically matched), RUNs: 0

Code
#Aug 2025
#PO Office, Entries: 0
#PO Office, RUNs: 0

invisible("Joined hashs with inconsistent birth dates with no external data")
invisible("with info of PO Office")
# Long-format table of candidate birth dates for hashes that lack external
# data: one row per (hash_key, C1 source), with the PO-derived candidate
# average birth dates joined in from both PO objects.
inconsistent_hashs_h_to_m_no_ext_long<-
inconsistent_hashs_h_to_m_no_ext|>
    # Number each candidate date within a person and tag it as a C1 source
    tidytable::group_by(hash_key)|> 
    tidytable::mutate(source_birth= dplyr::row_number())|> 
    tidytable::ungroup()|> 
    tidytable::select(hash_key, source_birth, everything())|> 
    tidytable::mutate(source_birth= paste0("orig_",source_birth))|> 
    tidytable::select(hash_key, source_birth, birth_date)|> 
    # Attach PO Office average birth dates (May 2023 and Aug 2025 alternative
    # match); suffixes .x/.y distinguish the two avg_birth_date_po columns
    tidytable::left_join(PO_brith_dates_for_inconsistent_dates, by=c("hash_key"="HASH_KEY.x"))|>
    tidytable::left_join(PO_brith_dates_for_inconsistent_dates_alt, by=c("hash_key"="HASH_KEY"))|>
  tidytable::arrange(hash_key, source_birth)|> 
  tidytable::filter(!is.na(birth_date))|> 
  # Bring in the first recorded admission date per hash, then compute the age
  # at admission and the distance (days) to the May-2023 PO average birth date
  tidytable::left_join(distinct(subset(SISTRAT23_c1_2010_2024_df_prev1c, select=c("hash_key", "adm_date_rec")), hash_key, .keep_all = T), by="hash_key")|> tidytable::mutate(adm_age= lubridate::time_length(lubridate::interval(birth_date, adm_date_rec), unit="year"), diff = abs(as.numeric(difftime(birth_date, avg_birth_date_po.x, units = "days"))))
Code
# Graphviz DOT flowchart (via DiagrammeR) documenting the rule-based
# selection of consistent birth dates when external data are available
# (section 2.1.1). The node labels mirror the rule codes 2.1.1.a/b/c used
# in the `obs` columns downstream.
plot_inconsistent_ages_flowchart <- DiagrammeR::grViz("
digraph {
  graph [layout = dot, rankdir = TB]
  # Global node attributes
  node [shape = box, fontname = Helvetica]
  # Nodes
  start [label = 'Start (2.1.1)', shape = circle]
  filter_age [label = 'Filter birth dates with year of admission at <16 or >90\\nunless dates are consistent within ±2 years with other birth dates']
  check_year_diff [label = 'Calculate year difference within patients']
  year_diff_na [label = 'Is difference in years missing?', shape=diamond]
  log_error [label = 'Log error (year_diff is NA)']
  decision_year_diff [label = 'Is difference in\nyears > 2?', shape=diamond]
  discard_inconsistent [label = '2.1.1.a.Discard entries with\ninconsistent dates']
  count_orig [label = 'Count most frequent birth dates within C1 source']
  one_common_date [label = 'Is there exactly one\nmost common date in C1 source?', shape=diamond]
  multiple_common_dates [label = 'Are there multiple most common\ndates in C1 source?', shape=diamond]
  no_common_date [label = 'No common dates\nin C1 source']
  calculate_avg [label = 'Calculate average of\nmost common dates overall']
  select_closest [label = 'Select closest date\namong C1 dates to average']
  flag_inconsistent [label = 'Flag record (no\nvalid birth date determined)']
  assign_selected_date [label = 'Assign selected_birth_date']
  end [label = 'End', shape = circle]
  # Set Start and End to be on the same horizontal rank
  # Define subgraph for top and bottom alignment of Start and End
  { rank = min; start; }
  { rank = max; end; }
  # Connections
  start -> filter_age
  filter_age -> check_year_diff
  check_year_diff -> year_diff_na
  year_diff_na -> log_error [label = 'Yes']
  log_error -> count_orig
  year_diff_na -> decision_year_diff [label = 'No']
  decision_year_diff -> discard_inconsistent [label = 'Yes']
  discard_inconsistent -> count_orig
  decision_year_diff -> count_orig [label = 'No']
  count_orig -> one_common_date
  one_common_date -> assign_selected_date [label = 'Yes (2.1.1.b)']
  assign_selected_date -> end
  one_common_date -> multiple_common_dates [label = 'No']
  multiple_common_dates -> calculate_avg [label = 'Yes (2.1.1.c)']
  multiple_common_dates -> no_common_date [label = 'No']
  no_common_date -> flag_inconsistent
  flag_inconsistent -> end
  calculate_avg -> select_closest
  select_closest -> assign_selected_date
}
",
width = 800,
height = 900)

# Render the flowchart widget in the report
plot_inconsistent_ages_flowchart

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
invisible("Export database to explore it")
# Derive the project root: strips every "cons" substring from getwd() and
# leaves a trailing "/". NOTE(review): gsub removes "cons" anywhere in the
# path, not only a final folder — verify on machines whose path contains
# "cons" elsewhere.
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
# Pick the environment path by drive letter (G: Google Drive, otherwise E:)
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
# WidthCM<-8
# HeightCM<-6
# DPI<-600
# Remove any stale saveWidget support folder before re-exporting
unlink(paste0(wdpath,"cons/_figs/inconsistent_birthdates_flowchart_files"), recursive = TRUE)

# Save the flowchart as a self-contained HTML widget, then screenshot to PNG
htmlwidgets::saveWidget(plot_inconsistent_ages_flowchart, paste0(wdpath,"cons/_figs/inconsistent_birthdates_flowchart.html"))
webshot::webshot(paste0(wdpath,"cons/_figs/inconsistent_birthdates_flowchart.html"),paste0(wdpath,"cons/_figs/inconsistent_birthdates_flowchart.png"), vwidth = 300*1.2, vheight = 300,  zoom=10, expand=100)  # Try different top, left, width, and height coordinates.

Registered S3 methods overwritten by ‘callr’: method from format.callr_status_error
print.callr_status_error

Workflow for rule-based selection of consistent birth dates (w/ext. data)

Workflow for rule-based selection of consistent birth dates (w/ext. data)

Code
#https://stackoverflow.com/questions/1554635/graphviz-how-to-have-a-subgraph-be-left-to-right-when-main-graph-is-top-to-bot
#https://stackoverflow.com/questions/65509087/diagrammer-flowchart-align-vertical-nodes
#https://stackoverflow.com/questions/39451158/how-to-specify-vertical-alignment-of-nodes-in-r-package-diagrammer
#https://stackoverflow.com/questions/64323943/graphviz-and-dot-files-horizontal-and-vertical-node-alignment-intervening-node
#https://stackoverflow.com/questions/5424555/changing-edge-direction-in-dot
#https://graphviz.org/docs/attrs/rankdir/

Workflow for rule-based selection of consistent birth dates (w/ext. data)

[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
Code
invisible("1. <16 & >90 (adm_age), might be wrong ages ")
invisible("2. mfv within c1, if there is only one birth date with rank 1, that is, with no ties in the most frequent value")
invisible("3. closeness of one of candidate birthdates to the external value")
invisible("4. retrieval year")
invisible("3. Closest date to external average selected due to tie")
invisible("5. mean?")
invisible("(Some of the external dbs are not reliable due to imprecise quantity in the birth date)")

# Zero-row accumulator data frames used as global logs by
# process_birthdates(), which appends rows to them via `<<-`.

# Groups where the orig/non-orig year difference could not be computed.
error_log <- data.frame(
  hash_key          = character(0),
  orig_mean         = numeric(0),
  non_orig_mean     = numeric(0),
  year_diff         = numeric(0),
  issue_description = character(0),
  stringsAsFactors  = FALSE
)
# Rule 2.1.1.a: rows removed for implausible admission ages (<16 or >90).
group_1690_log <- data.frame(hash_key = character(0), desc = character(0), stringsAsFactors = FALSE)
# Rule 2.1.1.b: a single most common date was selected.
group_mfv1_log <- data.frame(hash_key = character(0), desc = character(0), stringsAsFactors = FALSE)
# Rule 2.1.1.c: ties resolved by closeness to external records.
group_mfv2_log <- data.frame(hash_key = character(0), desc = character(0), stringsAsFactors = FALSE)

invisible("1. discard birth dates with adm_age at <16 & >90 , unless every other birth dates are similar (-/+2 yrs) or lower 2. select the most frequent value within orig_ birth dates (source_birth) as long as there is more than 1 case in each frequent value; if there is only one birth date with rank 1, that is, with no ties in the most frequent value 3. if there are 2 ties or more, select the value within orig_ in source_birth which is closer to an external date or the average of the external dates 4. if 1, 2,3 did not apply, flag them in a new column")

# Process one 'hash_key' group of candidate birth dates and select a single
# consistent one (rules 2.1.1.a-c of the flowchart above).
#
# Args:
#   group: data frame of candidate birth dates for one person; uses columns
#     `birth_date`, `adm_age` (may contain the markers "Less16"/"More90")
#     and `source_birth` (values containing "orig" mark C1-sourced rows).
#   hash_key: ignored — it is immediately overwritten from `group$hash_key`.
#
# Returns: `group` with added columns `sel_birth_date` (taken from table
#   names, so stored as character rather than Date — NOTE(review): confirm
#   downstream coercion) and `flag` (TRUE when no valid date was determined).
#
# Side effects: attaches dplyr and lubridate (the masking messages in the
# rendered output originate here, and later unqualified calls rely on the
# attach) and appends rows to the global logs `error_log`, `group_1690_log`,
# `group_mfv1_log`, `group_mfv2_log` via `<<-`.
process_birthdates <- function(group, hash_key) {
  library(dplyr)
  library(lubridate)
  hash_key <- unique(group$hash_key)
  #Remove extremely implausible birth dates
  #Not necessary because of the first filter
  # group <- group %>%
  #     mutate(
  #       birth_date = ifelse(
  #         birth_date <= as.Date("1910-01-01") | birth_date >= as.Date("2010-01-01"),
  #         NA,
  #         birth_date
  #       )
  #     )
  # Filter out "Less16" and "More90" unless dates within group are consistent within ±2 years
  # See groups with an aberrant age at admission; take that birth date's
  # maximum vs. minimum. If the min-max difference exceeds 2 years, drop it.
  # We restrict this to rows whose source is tagged "orig".
  # Filter out "Less16" and "More90" unless dates within group are consistent within ±2 years
  if(any(group$adm_age %in% c("Less16", "More90"))) {
    # Calculate year difference between 'orig' and non-'orig' entries
    orig_mean <- mean(year(group$birth_date[group$adm_age %in% c("Less16", "More90")]), na.rm = TRUE)
    non_orig_mean <- mean(year(group$birth_date[!group$adm_age %in% c("Less16", "More90")]), na.rm = TRUE)
    year_diff <- abs(orig_mean - non_orig_mean)
    
    # If year_diff is NA, log this case in the error_log data frame
    if(is.na(year_diff)) {
      error_log <<- rbind(error_log, data.frame(
        hash_key = hash_key,
        orig_mean = orig_mean,
        non_orig_mean = non_orig_mean,
        year_diff = year_diff,
        issue_description = "year_diff is NA due to missing values in orig or non-orig group",
        stringsAsFactors = FALSE
      ))
    }
    
    # Check that year_diff is not NA and |diff| in years >2 before filtering
    if(!is.na(year_diff) && year_diff > 2) {
      # Rule 2.1.1.a: drop the implausible-age rows
      group <- group %>% dplyr::filter(!adm_age %in% c("Less16", "More90"))
      group_1690_log  <<- rbind(group_1690_log, data.frame(
        hash_key = hash_key,
        desc = "Less16|More90, removed rows due to >2 |diff|",
        stringsAsFactors = FALSE
      ))
    }
  }
  # Apply selection rules based on frequency and consistency 
  # If there is only one most common date among in C1, then this is the selected birth date 
  # if there are more most common dates, we borrow information from these other external birth dates
  # Get the most frequent birth date among original sources
  birth_counts <- table(group$birth_date[grepl("orig", group$source_birth)])
  if(length(birth_counts) > 0) {
    most_common_dates <- names(birth_counts)[which(birth_counts == max(birth_counts))]
  } else {
    # NOTE(review): scalar NA has length 1, so the length==1 branch below also
    # fires for groups with no C1 dates at all (assigns NA, flag = FALSE) and
    # the final else branch is unreachable — confirm this is intended.
    most_common_dates <- NA
  }
  # 'Count most frequent birth dates within C1 source'
  # Decision: 'Is there exactly one most common date in C1 source?'

  # Get the most frequent birth date overall
  birth_counts_os <- table(group$birth_date)
  if(length(birth_counts_os) > 0) {
    most_common_dates_os <- names(birth_counts_os)[which(birth_counts_os == max(birth_counts_os))]
  } else {
    most_common_dates_os <- NA
  }
  
  # Apply selection rules based on frequency and consistency
  # If there is only one most common date among in C1, then this is the selected birth date= When there's exactly one most common date, the code assigns it as the selected_birth_date and sets the flag to FALSE.
  # if there are more most common dates, we borrow information from these other external birth dates
  if(length(most_common_dates) == 1) {
    # Rule 2.1.1.b: single modal date among C1 sources — take it
    group <- group %>%
      dplyr::mutate(sel_birth_date = most_common_dates[1], flag = FALSE)
    
    group_mfv1_log  <<- rbind(group_mfv1_log, data.frame(
        hash_key = hash_key,
        desc = "The most common date is selected as the birth date",
        stringsAsFactors = FALSE
      ))
    
  } else if(length(most_common_dates) >= 2) {
    # Rule 2.1.1.c: ties — pick the C1 date closest to the average of the
    # overall (C1 + external) modal dates
    avg_date <- as.Date(mean(as.numeric(as.Date(most_common_dates_os)), na.rm=T), origin = "1970-01-01")
    closest_date <- most_common_dates[which.min(abs(as.Date(most_common_dates) - avg_date))]
    group <- group %>%
      dplyr::mutate(sel_birth_date = closest_date, flag = FALSE)
    
    group_mfv2_log  <<- rbind(group_mfv2_log, data.frame(
        hash_key = hash_key,
        desc = "Multiple common dates found. Select the birth date closest to available external records",
        stringsAsFactors = FALSE
      ))
  } else {
    # No candidate dates: flag the record (see unreachable-branch note above)
    group <- group %>%
      dplyr::mutate(sel_birth_date = NA, flag = TRUE)
  }
  return(group)
}
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("2024-11-18: Add PO to the candidate brith dates")
invisible("Consolidated dataframe of birthdates")
# Consolidated candidate birth dates: PO Office averages are appended as an
# extra "avg_po_date" source per hash, then every hash group is resolved by
# process_birthdates() and the per-rule log descriptions are joined back in.
proc_birthdates_ext_data <- 
  inconsistent_hashs_h_to_m_long|>
  # Attach one PO average birth date per hash (first match only)
  tidylog::left_join(dplyr::distinct(inconsistent_hashs_may23_PO_office, HASH_KEY.x, avg_birth_date_po), by= c("hash_key"="HASH_KEY.x"), multiple="first")|> 
  dplyr::filter(!is.na(avg_birth_date_po))|> 
  dplyr::select(hash_key, avg_birth_date_po)|> 
  # Reshape PO dates into the same long layout as the C1 candidates
  dplyr::mutate(source_birth = "avg_birth_date_po", birth_date = avg_birth_date_po, adm_age = "")|> 
  dplyr::select(-avg_birth_date_po)|> 
  dplyr::distinct(hash_key, birth_date)|> 
  dplyr::mutate(adm_age="", source_birth="avg_po_date")|> 
  dplyr::select(hash_key, source_birth, birth_date, adm_age)|> 
   # Stack the PO-derived rows under the original candidate rows
   (\(df) {
  dplyr::bind_rows(inconsistent_hashs_h_to_m_long, df)
    })()|>  
   dplyr::arrange(hash_key, source_birth)|>   
  # Resolve each hash group; process_birthdates() also fills the global logs
  dplyr::group_split(hash_key)|>
  purrr::map_dfr(~ process_birthdates(.x))|> 
  # Join the rule logs back and build a combined observation string `obs`
  tidylog::left_join(group_1690_log, by="hash_key", multiple="first")|> 
  tidylog::left_join(group_mfv1_log, by="hash_key", multiple="first")|> 
  tidylog::left_join(group_mfv2_log, by="hash_key", multiple="first")|> 
  dplyr::rename("obs1"="desc.x","obs2"="desc.y","obs3"="desc")|> 
  dplyr::mutate(obs1= ifelse(!is.na(obs1),paste0("2.1.1.a.",obs1),obs1))|>
  dplyr::mutate(obs2= ifelse(!is.na(obs2),paste0("2.1.1.b.",obs2),obs2))|>
  dplyr::mutate(obs3= ifelse(!is.na(obs3),paste0("2.1.1.c.",obs3),obs3))|>
  dplyr::mutate(obs = purrr::pmap_chr(list(obs1, obs2, obs3), ~ paste(na.omit(c(...)), collapse = " ;")))

left_join: added one column (avg_birth_date_po) > rows only in inconsistent_hashs_h_to.. 2,163 > rows only in dplyr::distinct(inconsi.. ( 397) > matched rows 15,282 > ======== > rows total 17,445

Adjuntando el paquete: ‘dplyr’

The following objects are masked from ‘package:tidytable’:

across, add_count, add_tally, anti_join, arrange, between,
bind_cols, bind_rows, c_across, case_match, case_when, coalesce,
consecutive_id, count, cross_join, cume_dist, cur_column, cur_data,
cur_group_id, cur_group_rows, dense_rank, desc, distinct, filter,
first, full_join, group_by, group_cols, group_split, group_vars,
if_all, if_any, if_else, inner_join, is_grouped_df, lag, last,
lead, left_join, min_rank, mutate, n, n_distinct, na_if, nest_by,
nest_join, nth, percent_rank, pick, pull, recode, reframe,
relocate, rename, rename_with, right_join, row_number, rowwise,
select, semi_join, slice, slice_head, slice_max, slice_min,
slice_sample, slice_tail, summarise, summarize, tally, top_n,
transmute, tribble, ungroup

The following objects are masked from ‘package:stats’:

filter, lag

The following objects are masked from ‘package:base’:

intersect, setdiff, setequal, union

Adjuntando el paquete: ‘lubridate’

The following objects are masked from ‘package:base’:

date, intersect, setdiff, union

left_join: added one column (desc) > rows only in purrr::map_dfr(dplyr::g.. 18,734 > rows only in group_1690_log ( 0) > matched rows 1,365 > ======== > rows total 20,099 left_join: added 2 columns (desc.x, desc.y) > rows only in tidylog::left_join(purr.. 7,498 > rows only in group_mfv1_log ( 0) > matched rows 12,601 > ======== > rows total 20,099 left_join: added one column (desc) > rows only in tidylog::left_join(tidy.. 12,601 > rows only in group_mfv2_log ( 0) > matched rows 7,498 > ======== > rows total 20,099

Code
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

message(paste0("Number of cases in error log: ", nrow(error_log)))

Number of cases in error log: 0

Code
# If any group could not compute a year difference, render those hashes for
# manual review (no output when error_log is empty, as in this run)
if(nrow(error_log)>0){
#a534782e4a670be9fe584f9b1b47a7f38a73f6b08f18aa7a7b4a45a7ba628dbf
inconsistent_hashs_h_to_m_long|>
  dplyr::filter(hash_key %in% error_log$hash_key) |> 
  knitr::kable("markdown", caption="HASHs with no records >16|<90 yrs old at admission to treatment")
}
cat("Flag record (no valid birth date determined)\n")

invisible("Flags")
# Count records for which no valid birth date could be determined
message(paste0("Number of flags= ",nrow(subset(proc_birthdates_ext_data,flag==TRUE))))

Number of flags= 0

Code
# Simple cases: rule code -> human-readable description.
cases <- c(
  "2.1.1.a." = "Discard entries with inconsistent dates (>2y within group)",
  "2.1.1.b." = "Exactly one most common date in C1 source",
  "2.1.1.c." = "Select closest date among C1 dates to average"
)

# Report entry and distinct-RUN counts for each rule code tagged in `obs`.
for (rule_code in names(cases)) {
  rule_desc <- cases[[rule_code]]
  pattern <- gsub(".", "\\.", rule_code, fixed = TRUE)  # escape literal dots
  hits <- dplyr::filter(proc_birthdates_ext_data, grepl(pattern, obs))
  cat(rule_code, "= ", rule_desc, "\n", sep = "")
  message(rule_code, " Entries: ", nrow(hits))
  message(rule_code, " RUNs: ", dplyr::n_distinct(hits$hash_key))
}

2.1.1.a. Entries: 1365 2.1.1.a. RUNs: 296 2.1.1.b. Entries: 12601 2.1.1.b. RUNs: 1900 2.1.1.c. Entries: 7498 2.1.1.c. RUNs: 1638

Code
# Intersections of rule codes (A & B, A & C, B & C).
pairs <- list(
  c("2.1.1.a.", "2.1.1.b."),
  c("2.1.1.a.", "2.1.1.c."),
  c("2.1.1.b.", "2.1.1.c.")
)

# For each pair, count entries/RUNs whose `obs` string matches both codes.
for (codes in pairs) {
  pats <- gsub(".", "\\.", codes, fixed = TRUE)  # escape literal dots
  both <- dplyr::filter(proc_birthdates_ext_data, grepl(pats[1], obs) & grepl(pats[2], obs))
  label <- paste(codes, collapse = " & ")
  message(label, " Entries: ", nrow(both))
  message(label, " RUNs: ", dplyr::n_distinct(both$hash_key))
}

2.1.1.a. & 2.1.1.b. Entries: 1325 2.1.1.a. & 2.1.1.b. RUNs: 287 2.1.1.a. & 2.1.1.c. Entries: 40 2.1.1.a. & 2.1.1.c. RUNs: 9 2.1.1.b. & 2.1.1.c. Entries: 0 2.1.1.b. & 2.1.1.c. RUNs: 0

Code
# 2.1.1.a., Entries: 1274
# 2.1.1.a., RUNs: 283
# 2.1.1.b., Entries: 11488
# 2.1.1.b., RUNs: 1748
# 2.1.1.c., Entries: 7255
# 2.1.1.c., RUNs: 1568
# 2.1.1.a. & 2.1.1.b., Entries: 1238
# 2.1.1.a. & 2.1.1.b., RUNs: 275
# 2.1.1.a. & 2.1.1.c., Entries: 36
# 2.1.1.a. & 2.1.1.c., RUNs: 8
# 2.1.1.b. & 2.1.1.c., Entries: 0
# 2.1.1.b. & 2.1.1.c., RUNs: 0

# Aug 2025
#2.1.1.a.= Discard entries with inconsistent dates (>2y within group)
# 2.1.1.a., Entries: 1365
# 2.1.1.a., RUNs: 296
#2.1.1.b.= Exactly one most common date in C1 source
# 2.1.1.b., Entries: 12601
# 2.1.1.b., RUNs: 1900
#2.1.1.c.= Select closest date among C1 dates to average
# 2.1.1.c., Entries: 7498
# 2.1.1.c., RUNs: 1638
# 2.1.1.a. & 2.1.1.b., Entries: 1325
# 2.1.1.a. & 2.1.1.b., RUNs: 287
# 2.1.1.a. & 2.1.1.c., Entries: 40
# 2.1.1.a. & 2.1.1.c., RUNs: 9
# 2.1.1.b. & 2.1.1.c., Entries: 0
# 2.1.1.b. & 2.1.1.c., RUNs: 0


# Recompute project root and environment path (same logic as earlier chunks)
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath

invisible("manual review of criteria to see consistency")
# Export a seeded random sample of processed birth dates for manual review.
# NOTE(review): 100 hashes are sampled but the file name says "sample25" —
# confirm which is intended.
proc_birthdates_ext_data|>
    tidytable::filter(hash_key %in% dplyr::pull(sample_n_with_seed(data.frame(hashs_inconsistent_birth_dates),100, seed=2125),1))|> 
    rio::export(file=paste0(wdpath,"cons/_out/inconsistent_birthdates_long_sample25.csv"))
#every entry has less than 16 years old
Flag record (no valid birth date determined)
2.1.1.a.= Discard entries with inconsistent dates (>2y within group)
2.1.1.b.= Exactly one most common date in C1 source
2.1.1.c.= Select closest date among C1 dates to average
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"

2.1.2. Inconsistent birth dates with no external data

However, there were cases with no external data. In these cases, we discarded birth dates earlier than 1910-01-01 or later than 2010-01-01.

Code
# Graphviz DOT flowchart (via DiagrammeR) documenting the rule-based
# selection of consistent birth dates when NO external data are available
# (section 2.1.2). Labels mirror the rule codes 2.1.2.a/b/c1/c2 used by
# process_group_noextinfo().
plot_inconsistent_ages_noext_flowchart <- DiagrammeR::grViz("
digraph {
  graph [layout = dot, rankdir = TB]

  # Global node attributes
  node [shape = box, fontname = Helvetica]

  # Nodes
  start [label = 'Start', shape = circle]
  discard_dates [label = 'Discard birth dates prior to 1910-01-01\\nor after 2010-01-01']
  check_valid_dates [label = 'Any valid birth dates remaining?', shape = diamond]
  no_valid_dates [label = 'Set sel_birth_date to NA\\nFLAG: No valid birth dates']
  avg_birth_date_po_available [label = 'Is any birth date from PO available?', shape = diamond]
  select_closest_to_po [label = '2.1.2.a.Select date closest to PO']
  create_frequency_table [label = 'Create frequency table of birth_dates']
  single_most_frequent [label = 'Is there a single most frequent date?', shape = diamond]
  select_most_frequent [label = '2.1.2.b.Select most frequent date']
  tie_mfv_twoyrs [label = 'Differences greater than\n2 years between ties?', shape = diamond]
  tie_most_frequent [label = '2.1.2.c.1.Ties among most\nfrequent dates with\ndifferences > 2 years\\nFLAG: Unresolved inconsistencies']
  tie_most_frequent2 [label = '2.1.2.c.2.Ties among most\nfrequent dates with\ndifferences <= 2 years']
  end [label = 'End', shape = circle]

  # Connections
  start -> discard_dates
  discard_dates -> check_valid_dates
  check_valid_dates -> no_valid_dates [label = 'No']
  #no_valid_dates -> end
  check_valid_dates -> avg_birth_date_po_available [label = 'Yes']
  avg_birth_date_po_available -> select_closest_to_po [label = 'Yes']
  select_closest_to_po -> end
  avg_birth_date_po_available -> create_frequency_table [label = 'No']
  create_frequency_table -> single_most_frequent
  single_most_frequent -> select_most_frequent [label = 'Yes']
  select_most_frequent -> end
  single_most_frequent -> tie_mfv_twoyrs [label = 'No']
  tie_mfv_twoyrs -> tie_most_frequent [label = 'No']
  tie_mfv_twoyrs -> tie_most_frequent2 [label = 'Yes']
  tie_most_frequent2 -> end
}
",
width = 800,
height = 900)

# Log descriptions produced by the corresponding rules (for reference):
# desc = "Less16|More90, removed rows due to >2 |diff|",
# desc = "The most common date is selected as the birth date",
# desc = "Multiple common dates found. Select the birth date closest to available external records",

# Render the flowchart widget in the report
plot_inconsistent_ages_noext_flowchart

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:
invisible("Export database to explore it")
# Derive the project root: strips every "cons" substring from getwd() and
# leaves a trailing "/" (same caveat as the earlier wdpath computation)
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
# Pick the environment path by drive letter (G: Google Drive, otherwise E:)
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
# WidthCM<-8
# HeightCM<-6
# DPI<-600
# Remove any stale saveWidget support folder before re-exporting
unlink(paste0(wdpath,"cons/_figs/inconsistent_ages_noext_flowchart_files"), recursive = TRUE)

# Save the flowchart as a self-contained HTML widget, then screenshot to PNG
htmlwidgets::saveWidget(plot_inconsistent_ages_noext_flowchart, paste0(wdpath,"cons/_figs/inconsistent_ages_noext_flowchart.html"))
webshot::webshot(paste0(wdpath,"cons/_figs/inconsistent_ages_noext_flowchart.html"),paste0(wdpath,"cons/_figs/inconsistent_ages_noext_flowchart.png"), vwidth = 300*1.2, vheight = 300,  zoom=10, expand=100)  # Try different top, left, width, and height coordinates.
Workflow for rule-based selection of consistent birth dates (No ext. data)

Workflow for rule-based selection of consistent birth dates (No ext. data)

Code
#https://stackoverflow.com/questions/1554635/graphviz-how-to-have-a-subgraph-be-left-to-right-when-main-graph-is-top-to-bot
#https://stackoverflow.com/questions/65509087/diagrammer-flowchart-align-vertical-nodes
#https://stackoverflow.com/questions/39451158/how-to-specify-vertical-alignment-of-nodes-in-r-package-diagrammer
#https://stackoverflow.com/questions/64323943/graphviz-and-dot-files-horizontal-and-vertical-node-alignment-intervening-node
#https://stackoverflow.com/questions/5424555/changing-edge-direction-in-dot
#https://graphviz.org/docs/attrs/rankdir/

Workflow for rule-based selection of consistent birth dates (No ext. data)

[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
Code
invisible("Discard birth dates previous or equal to 1910-01-01, or posterior to 2010-01-01")
invisible("IF PO available, select the date closer to PO in sel_birth_date")
invisible("If there is no PO available, select the most frequent date")
invisible("If ties among most frequent dates, FLAG it column FLAG as unresolvable inconsistency and return an NA in sel_birth_date")

invisible("IF PO available, select the date closer to PO in sel_birth_date")

# Select a single consistent birth date for one hash_key group that lacks
# broader external data (rules 2.1.2.0/a/b/c1/c2/d of the flowchart above).
#
# Args:
#   df_group: data frame for one person with columns `birth_date` and
#     `avg_birth_date_po` (possibly all-NA).
#
# Returns: `df_group` with added columns `sel_birth_date`, `FLAG` (NA when
#   the case is resolved; otherwise a description of the failure) and `obs`
#   (the rule label that was applied).
#
# Relies on dplyr being attached: mutate()/if_else() are unqualified.
process_group_noextinfo <- function(df_group) {

  # Discard birth dates prior to 1910-01-01 or after 2010-01-01.
  # NOTE(review): `<=`/`>=` also drops the boundary dates themselves, while
  # the flowchart says "prior to"/"after" — confirm intended inclusivity.
  df_group <- df_group|>
    mutate(
      birth_date = if_else( #requested in 2025-03-19
        birth_date <= as.Date("1910-01-01") | birth_date >= as.Date("2010-01-01"),
        as.Date(NA_character_),
        birth_date
      )
    )
  
  # Initialize 'sel_birth_date', 'FLAG', and 'obs'
  df_group <- df_group|>
    mutate(
      sel_birth_date = as.Date(NA),
      FLAG = NA,
      obs = NA
    )
  
  # Extract necessary values (may be length 0 or >1 after removing NAs)
  avg_birth_date_po <- unique(df_group$avg_birth_date_po[!is.na(df_group$avg_birth_date_po)])
  
  # Remove NA values from birth_dates
  birth_dates <- df_group$birth_date[!is.na(df_group$birth_date)]

  # Rule 1: If 'avg_birth_date_po' is available, select the date closest to it
  #Error en !is.na(avg_birth_date_po) && length(birth_dates) > 0: 'length = 2' in coercion to 'logical(1)'
  # length() guard replaces the earlier !is.na() test, which errored when
  # more than one distinct PO date was present (see the comment above)
  if (length(avg_birth_date_po)>0 && length(birth_dates) > 0) {
  # Rule 1: If 'avg_birth_date_po' is available, select the date closest to it
  #if (!is.na(avg_birth_date_po) && length(birth_dates) > 0) {
    # Calculate differences between birth_dates and avg_birth_date_po
    diffs <- abs(as.numeric(difftime(birth_dates, avg_birth_date_po, units = "days")))
    diffs[is.na(diffs)] <- Inf
    
    if (all(is.infinite(diffs))) {
      # All diffs are NA or Inf (rule 2.1.2.0): cannot pick a closest date
      df_group <- df_group |>
        mutate(
          sel_birth_date = as.Date(NA),
          obs = "2.1.2.0.All diffs are NA or Inf",
          FLAG = "Unable to select date closest to PO"
        )
      return(df_group)
    }
    
    # Rule 2.1.2.a: keep the candidate date closest to the PO date
    sel_date <- birth_dates[which.min(diffs)]
    df_group <- df_group |>
      mutate(
        sel_birth_date = sel_date,
        obs = "2.1.2.a.Selected date closest to PO",
        FLAG = NA
      )
    return(df_group)
  }
  # Rule 2: If no 'avg_birth_date_po', select the most frequent date
  if (length(birth_dates) > 0) {
    date_counts <- table(birth_dates)
    max_count <- max(date_counts)
    most_freq_dates <- as.Date(names(date_counts[date_counts == max_count]))
    #most_freq_dates <- base::as.Date(names(date_counts[date_counts == max_count]), format = "%Y-%m-%d")

    if (length(most_freq_dates) == 1) {
      # No tie, select the most frequent date (rule 2.1.2.b)
      sel_date <- most_freq_dates
      df_group <- df_group|>
        dplyr::mutate(
          sel_birth_date = sel_date,
          obs = "2.1.2.b.Selected most frequent date",
          FLAG = NA
        )
      return(df_group)
    } else {
      # Tie among most frequent dates
      # New Rule: Check if differences among tied dates are > 2 years
      # (730 days; NOTE(review): approximation that ignores leap days)
      max_diff_days <- as.numeric(max(most_freq_dates) - min(most_freq_dates))
      if (max_diff_days > 730) {
        # Rule 2.1.2.c1: ties too far apart — unresolvable, flag the group
        df_group <- df_group|>
          mutate(
            sel_birth_date = as.Date(NA),
            FLAG = "Unresolvable inconsistency: Ties among most frequent dates",
            obs = "2.1.2.c1.Ties among most frequent dates. Unresolvable inconsistency"
          )
        return(df_group)
      } else {
        # Rule 2.1.2.c2: differences <= 2 years, compute average date
        sel_date_numeric <- mean(as.numeric(most_freq_dates))
        sel_date <- base::as.Date(sel_date_numeric, origin = "1970-01-01")
        df_group <- df_group %>%
          mutate(
            sel_birth_date = sel_date,
            obs = "2.1.2.c2.Similar ties in dates, replaced with the average date",
            FLAG = NA
          )
        return(df_group) 
      }
    }
  } else {
    # Rule 2.1.2.d: no valid birth dates remaining after the 1910/2010 filter
    df_group <- df_group|> 
      mutate(
        sel_birth_date = base::as.Date(NA),
        FLAG = "No valid birth dates",
        obs = "2.1.2.d.No valid birth dates"
      )
    return(df_group)
  }
}
    
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Apply the function to every hash_key

# inconsistent_hashs_h_to_m_no_ex

# Apply the rule-based selection (process_group_noextinfo) to every hash_key
# group of the no-external-data long table.
#
# Fix: the status banner used `print(cat(...))`; cat() returns NULL, so the
# rendered output showed a stray "NULL" after the FLAG line (visible in the
# echoed output of this document). cat() is now called for its side effect
# alone, which removes the spurious "NULL".
proc_inconsistent_hashs_h_to_m_no_ext_long <- 
  inconsistent_hashs_h_to_m_no_ext_long|> 
  #AUG 2025: coalesce the two PO-derived candidate columns (.y preferred).
  # NOTE(review): base ifelse() drops the Date class from its result —
  # confirm avg_birth_date_po is still usable as a Date downstream
  # (process_group_noextinfo calls difftime() on it); tidytable::if_else()
  # or data.table::fifelse() would preserve the class.
  tidytable::mutate(avg_birth_date_po= ifelse(!is.na(avg_birth_date_po.y),avg_birth_date_po.y,avg_birth_date_po.x))|>
    # Side-effect lambda: report counts, then pass the data through unchanged
    (\(df) {
        cat("FLAG: Unresolved inconsistencies\n")
        (message(paste0("Inconsistent birth dates with no external data, Entries: ", nrow(df))))
        (message(paste0("Inconsistent birth dates with no external data, RUNs: ", tidytable::distinct(df, hash_key)|> nrow())))
        df
    })()|>
  # Resolve each person's candidate dates with the no-external-data rules
  dplyr::group_by(hash_key)|> 
  dplyr::group_split()|> 
  purrr::map_dfr(~ process_group_noextinfo(.x))

Inconsistent birth dates with no external data, Entries: 1152

Inconsistent birth dates with no external data, RUNs: 445

Code
#Inconsistent birth dates with no external data, Entries: 1022
#Inconsistent birth dates with no external data, RUNs: 397
#Aug 2025
#Inconsistent birth dates with no external data, Entries: 1152
#Inconsistent birth dates with no external data, RUNs: 445
#Aviso: Unknown or uninitialised column: `avg_birth_date_po`.


# debug(process_group_noextinfo)
# 
# # run your problematic command:
# proc_inconsistent_hashs_h_to_m_no_ext_long <- inconsistent_hashs_h_to_m_no_ext_long |> 
#   dplyr::group_by(hash_key) |> 
#   dplyr::group_split() |> 
#   purrr::map_dfr(~ process_group_noextinfo(.x))
# 
# undebug(process_group_noextinfo)



#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("get numbers of resolutions")

# Report counts for rule 2.1.2.0 (all diffs to the PO date were NA/Inf)
proc_inconsistent_hashs_h_to_m_no_ext_long |>
  dplyr::filter(grepl("2\\.1\\.2\\.0\\.", obs)) |>
  (\(df){
    cat("All diffs are NA or Inf\n")
    message("2.1.2.0., Entries: ", nrow(df))
    message("2.1.2.0., RUNs: ", dplyr::n_distinct(df$hash_key))
  })()

2.1.2.0., Entries: 0

2.1.2.0., RUNs: 0

Code
# Report counts for rule 2.1.2.a (date closest to the PO date selected)
proc_inconsistent_hashs_h_to_m_no_ext_long |>
  dplyr::filter(grepl("2\\.1\\.2\\.a\\.", obs)) |>
  (\(df){
    cat("2.1.2.a. Select date closest to PO\n")
    message("2.1.2.a., Entries: ", nrow(df))
    message("2.1.2.a., RUNs: ", dplyr::n_distinct(df$hash_key))
  })()

2.1.2.a., Entries: 997

2.1.2.a., RUNs: 379

Code
# Report counts for rule 2.1.2.b (single most frequent date selected)
proc_inconsistent_hashs_h_to_m_no_ext_long |>
  dplyr::filter(grepl("2\\.1\\.2\\.b\\.", obs)) |>
  (\(df){
    cat("2.1.2.b. Select most frequent date\n")
    message("2.1.2.b., Entries: ", nrow(df))
    message("2.1.2.b., RUNs: ", dplyr::n_distinct(df$hash_key))
  })()

2.1.2.b., Entries: 54

2.1.2.b., RUNs: 17

Code
# Report counts for rule 2.1.2.c1 (unresolvable ties among frequent dates)
proc_inconsistent_hashs_h_to_m_no_ext_long |>
  dplyr::filter(grepl("2\\.1\\.2\\.c1\\.", obs)) |>
  (\(df){
    cat("2.1.2.c1. Ties among most frequent dates. Unresolvable inconsistency\n")
    message("2.1.2.c1., Entries: ", nrow(df))
    message("2.1.2.c1., RUNs: ", dplyr::n_distinct(df$hash_key))
  })()

2.1.2.c1., Entries: 26

2.1.2.c1., RUNs: 12

Code
# Report counts for rule 2.1.2.c2 (close ties replaced with their average)
proc_inconsistent_hashs_h_to_m_no_ext_long |>
  dplyr::filter(grepl("2\\.1\\.2\\.c2\\.", obs)) |>
  (\(df){
    cat("2.1.2.c2. Similar ties in dates, replaced with the average date\n")
    message("2.1.2.c2., Entries: ", nrow(df))
    message("2.1.2.c2., RUNs: ", dplyr::n_distinct(df$hash_key))
  })()

2.1.2.c2., Entries: 75

2.1.2.c2., RUNs: 37

Code
# Report counts for rule 2.1.2.d (no valid birth dates left after filtering)
proc_inconsistent_hashs_h_to_m_no_ext_long |>
  dplyr::filter(grepl("2\\.1\\.2\\.d\\.", obs)) |>
  (\(df){
    cat("2.1.2.d. No valid birth dates\n")
    message("2.1.2.d., Entries: ", nrow(df))
    message("2.1.2.d., RUNs: ", dplyr::n_distinct(df$hash_key))
  })()

2.1.2.d., Entries: 0

2.1.2.d., RUNs: 0

Code
# 2.1.2.0., Entries: 0
# 2.1.2.0., RUNs: 0
# 2.1.2.a., Entries: 921
# 2.1.2.a., RUNs: 353
# 2.1.2.b., Entries: 30
# 2.1.2.b., RUNs: 10
# 2.1.2.c1., Entries: 26
# 2.1.2.c1., RUNs: 12
# 2.1.2.c2., Entries: 45
# 2.1.2.c2., RUNs: 22
# 2.1.2.d., Entries: 0
# 2.1.2.d., RUNs: 0
#AUG 2025
#All diffs are NA or Inf
#2.1.2.0., Entries: 0
#2.1.2.0., RUNs: 0
#2.1.2.a.Select date closest to PO
#2.1.2.a., Entries: 997
#2.1.2.a., RUNs: 379
#2.1.2.b.Select most frequent date
#2.1.2.b., Entries: 54
#2.1.2.b., RUNs: 17
#2.1.2.c1.Ties among most frequent dates. Unresolvable inconsistency
#2.1.2.c1., Entries: 26
#2.1.2.c1., RUNs: 12
#2.1.2.c2.Similar ties in dates, replaced with the average date
#2.1.2.c2., Entries: 75
#2.1.2.c2., RUNs: 37
#2.1.2.d.No valid birth dates
#2.1.2.d., Entries: 0
#2.1.2.d., RUNs: 0
FLAG: Unresolved inconsistencies
NULL
All diffs are NA or Inf
2.1.2.a. Select date closest to PO
2.1.2.b. Select most frequent date
2.1.2.c1. Ties among most frequent dates. Unresolvable inconsistency
2.1.2.c2. Similar ties in dates, replaced with the average date
2.1.2.d. No valid birth dates

As of August 2025, for cases with the label “2.1.2.c1.Ties among most frequent dates. Unresolvable inconsistency” we imputed the most probable birth date based on "sexo", "tipo_centro", "tipo_de_plan", "pais_nacimiento", "se_trata_de_una_mujer_embarazada", "escolaridad_ultimo_ano_cursado", "sustancia_principal", "edad_inicio_sustancia_principal", "tiene_menores_de_edad_a_cargo", "edad_inicio_consumo", "numero_de_hijos", "estado_conyugal", "TABLE_rec2", and "numero_de_tratamientos_anteriores" using the missRanger package. We computed the mean of the imputed values across all available estimation methods, identified the original birth date with the minimal absolute deviation from the group mean, and assigned this consensus date to all records within the group.

Code
#hashs_invalid_adm_age

# We explored 3 types of imputation: using k-nearest neighbours, random forests and multiple imputation with chained equations. The variables used as candidates were: `"sexo"`, `"tipo_centro"`, `"tipo_de_plan"`, `"pais_nacimiento"`, `"se_trata_de_una_mujer_embarazada"`, `"escolaridad_ultimo_ano_cursado"`, `"sustancia_principal"`, `"edad_inicio_sustancia_principal"`, `"tiene_menores_de_edad_a_cargo"`, `"edad_inicio_consumo"`, `"numero_de_hijos"`, `"estado_conyugal"`, `"TABLE_rec2"`, `"numero_de_tratamientos_anteriores"`, `"usuario_de_tribunales_tratamiento_drogas"`.
# 
# The following variables have specific characteristics that should be considered:  
# 
# - `tiene_menores_de_edad_a_cargo` (responsible for minors): missing data is only present before 2015.
# - `numero_de_hijos` (number of children): values greater than 11 are incorrect.  
# - `usuario_de_tribunales_tratamiento_drogas` (drug treatment court user): contains more missing data in 2016.  
# - `pais_nacimiento` (country of birth): this information starts being collected in 2016.
# - `se_trata_de_una_mujer_embarazada` (pregnant woman): substantial missing data, but it is evenly distributed across annual datasets.  

# SISTRAT23_c1_2010_2024_df_prev1c|> 
#   dplyr::filter(hash_key %in% hashs_inconsistent_ages_post_rule_based_imp) |> View()

# SISTRAT23_c1_2010_2024_df_prev1c|>
#   dplyr::filter(hash_key %in% (proc_inconsistent_hashs_h_to_m_no_ext_long$hash_key[grepl("2.1.2.c1",proc_inconsistent_hashs_h_to_m_no_ext_long$obs)]))|> View()

# Hash keys flagged "2.1.2.c1" (unresolvable ties among most frequent
# birth dates). fixed = TRUE matches the label literally; the previous
# bare pattern let each "." match any character.
c1_hash_keys <-
  proc_inconsistent_hashs_h_to_m_no_ext_long$hash_key[
    grepl("2.1.2.c1", proc_inconsistent_hashs_h_to_m_no_ext_long$obs,
          fixed = TRUE)
  ]

# Build the imputation input: blank out the birth date for the flagged
# RUNs so it can be imputed, keeping the auxiliary predictor columns.
dataset_with_na_birth_date <-
  SISTRAT23_c1_2010_2024_df_prev1c |>
  # ifelse() drops the Date class, so birth_date_for_imp becomes numeric
  # (days since 1970-01-01); downstream code relies on this.
  dplyr::mutate(
    birth_date_for_imp = ifelse(hash_key %in% c1_hash_keys, NA, birth_date)
  ) |>
  dplyr::select(any_of(c(
    "rn", "hash_key", "birth_date_for_imp", "birth_date", "sexo",
    "tipo_centro", "tipo_de_plan", "pais_nacimiento",
    "se_trata_de_una_mujer_embarazada", "escolaridad_ultimo_ano_cursado",
    "sustancia_principal", "edad_inicio_sustancia_principal",
    "tiene_menores_de_edad_a_cargo", "edad_inicio_consumo",
    "numero_de_hijos", "estado_conyugal", "TABLE_rec",
    "numero_de_tratamientos_anteriores",
    "usuario_de_tribunales_tratamiento_drogas"
  ))) |>
  # Rule-based fills for predictors whose missingness is structural
  # (variables not collected in every year).
  dplyr::mutate(
    se_trata_de_una_mujer_embarazada =
      ifelse(is.na(se_trata_de_una_mujer_embarazada), "no",
             se_trata_de_una_mujer_embarazada),
    pais_nacimiento =
      ifelse(is.na(pais_nacimiento), "chile", pais_nacimiento),
    tiene_menores_de_edad_a_cargo =
      ifelse(is.na(tiene_menores_de_edad_a_cargo), "no",
             tiene_menores_de_edad_a_cargo)
  )

# Strip any remaining class attribute so the imputation target is a
# plain numeric vector (days since 1970-01-01).
dataset_with_na_birth_date$birth_date_for_imp <-
  unclass(dataset_with_na_birth_date$birth_date_for_imp)

# Percentage of missing values per column, sorted largest first.
missing_proportions <- sort(
  100 * colMeans(is.na(dataset_with_na_birth_date)),
  decreasing = TRUE
)

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

# Impute the blanked birth dates with random forests (missRanger),
# using predictive mean matching so imputed values are drawn from
# observed birth dates rather than arbitrary predictions.
set.seed(2125)
# https://mayer79.github.io/missRanger/articles/missRanger.html
SISTRAT23_c1_2010_2024_df_prev1c_imp <- missRanger::missRanger(
    data = dataset_with_na_birth_date,
    formula = birth_date_for_imp ~ sexo + tipo_centro + tipo_de_plan +
              pais_nacimiento + se_trata_de_una_mujer_embarazada +
              escolaridad_ultimo_ano_cursado + sustancia_principal +
              edad_inicio_sustancia_principal +
              tiene_menores_de_edad_a_cargo + edad_inicio_consumo +
              numero_de_hijos + estado_conyugal +
              TABLE_rec + numero_de_tratamientos_anteriores +
              usuario_de_tribunales_tratamiento_drogas,
    num.trees = 5e3,
    pmm.k = 3,            # Predictive mean matching with 3 donors
    keep_forests = TRUE,  # TRUE/FALSE spelled out (T/F are reassignable)
    returnOOB = TRUE,     # attach out-of-bag prediction errors
    #mtry= function(p) max(3, trunc(p / 3)), # At least 3 or parameters/3, whichever is greater.
    maxiter = 5e2,
    verbose = 2,
    seed = 2125,
    #case.weights = rowSums(!is.na(SISTRAT23_c1_2010_2024_df_prev1f)) #pass case weights to the imputation models. For instance, this allows to reduce the contribution of rows with many missings
  )

Missing value imputation by random forests

Code
# Report diagnostics from the missRanger run.
paste0("Best iter:", SISTRAT23_c1_2010_2024_df_prev1c_imp$best_iter)

# The imputation formula above has 15 predictors; ranger's default mtry
# is floor(sqrt(p)). The previous hard-coded 12 understated p but gave
# the same displayed value, since floor(sqrt(12)) == floor(sqrt(15)) == 3.
paste0("Mtry: how many covariates are considered in each tree split: ", floor(sqrt(15)))
# Quick and balanced. Increase if there are many complexities to capture;
# reduce if data are scarce or there are signs of overfitting.

#OOB prediction error per iteration and variable (1 minus R-squared for regression)

#The default mtry in missRanger is sqrt(p), where p is the number of variables in the dataset.
#OOB prediction errors are quantified as 1 - R^2 for numeric variables, and as classification error otherwise. If a variable has been imputed only univariately, the value is 1.
#https://rdrr.io/cran/missRanger/man/missRanger.html

paste0("The model explains ",scales::percent(1-min(SISTRAT23_c1_2010_2024_df_prev1c_imp$mean_pred_errors), accuracy=.1), " of the variance. This is calculated using the out of bag samples in each tree split")
# [1] "The model explains 51.9% of the variance. This is calculated using the out of bag samples in each tree split"

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Alternative imputation: k-nearest neighbours (VIM::kNN, k = 3),
# aggregating the donors' birth dates with the mean.
# NOTE(review): VIM::kNN documents numFun as a function (e.g. mean);
# the string "mean" appears to be accepted here — confirm against the
# installed VIM version before changing it.
set.seed(2125)
SISTRAT23_c1_2010_2024_df_prev1c_imp_kNN <- VIM::kNN(
  dataset_with_na_birth_date,
  variable = c("birth_date_for_imp"),
  dist_var = c("sexo", "tipo_centro", "tipo_de_plan", "pais_nacimiento",
               "se_trata_de_una_mujer_embarazada",
               "escolaridad_ultimo_ano_cursado", "sustancia_principal",
               "edad_inicio_sustancia_principal",
               "tiene_menores_de_edad_a_cargo", "edad_inicio_consumo",
               "numero_de_hijos", "estado_conyugal", "TABLE_rec",
               "numero_de_tratamientos_anteriores",
               "usuario_de_tribunales_tratamiento_drogas"),
  numFun = "mean",
  k = 3,
  trace = TRUE  # TRUE spelled out (T is reassignable)
)

Detected as categorical variable:

hash_key,sexo,tipo_centro,tipo_de_plan,pais_nacimiento,se_trata_de_una_mujer_embarazada,escolaridad_ultimo_ano_cursado,sustancia_principal,tiene_menores_de_edad_a_cargo,estado_conyugal,TABLE_rec,usuario_de_tribunales_tratamiento_drogas,birth_date_for_imp_imp

Detected as ordinal variable:

Detected as numerical variable:

rn,birth_date_for_imp,edad_inicio_sustancia_principal,edad_inicio_consumo,numero_de_hijos,numero_de_tratamientos_anteriores

28items ofvariable:birth_date_for_imp imputed

Code
# Collect original vs imputed birth dates from both imputation methods
# (missRanger and kNN) into one comparison frame, flagging the RUNs
# whose birth date was blanked for imputation ("2.1.2.c1" ties).
df_birth_date_imp <-
  cbind.data.frame(
    rn = SISTRAT23_c1_2010_2024_df_prev1c_imp$data$rn,
    hash_key = SISTRAT23_c1_2010_2024_df_prev1c_imp$data$hash_key,
    birth_date_for_imp_missranger = SISTRAT23_c1_2010_2024_df_prev1c_imp$data$birth_date_for_imp,
    birth_date_missranger = SISTRAT23_c1_2010_2024_df_prev1c_imp$data$birth_date,
    birth_date_for_imp_knn = SISTRAT23_c1_2010_2024_df_prev1c_imp_kNN$birth_date_for_imp_imp,
    birth_date_for_imp_knn_og = SISTRAT23_c1_2010_2024_df_prev1c_imp_kNN$birth_date_for_imp,
    # fixed = TRUE matches the "2.1.2.c1" label literally; the previous
    # bare pattern let each "." match any character.
    miss_birth_dates = ifelse(
      SISTRAT23_c1_2010_2024_df_prev1c_imp$data$hash_key %in%
        proc_inconsistent_hashs_h_to_m_no_ext_long$hash_key[
          grepl("2.1.2.c1", proc_inconsistent_hashs_h_to_m_no_ext_long$obs,
                fixed = TRUE)
        ],
      1, 0
    )
  )

# Free the large imputation objects now that their columns are extracted.
rm("SISTRAT23_c1_2010_2024_df_prev1c_imp_kNN")
rm("SISTRAT23_c1_2010_2024_df_prev1c_imp")

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

# Pool the imputed birth dates per person and pick, among the person's
# recorded dates, the one closest to the pooled value. (The original
# code computed an unused birth_date_numeric column before summarise()
# — dead code, removed — and mixed %>% with |> in the last step.)

# Step 1: pooled (mean) imputed value per hash_key. The imputed column
# is already numeric days since 1970-01-01.
pooled_miss_birthdate_df <- df_birth_date_imp |>
  dplyr::group_by(hash_key) |>
  dplyr::summarise(
    pooled_value = mean(birth_date_for_imp_missranger, na.rm = TRUE)
  )

# Step 2: among rows whose birth date was blanked for imputation, find
# the recorded date with minimal absolute deviation from the pooled value.
closest_dates_birthdate <- df_birth_date_imp |>
  dplyr::filter(miss_birth_dates == 1) |>
  dplyr::left_join(pooled_miss_birthdate_df, by = "hash_key") |>
  dplyr::mutate(birth_date_numeric = as.numeric(as.Date(birth_date_missranger, origin = '1970-01-01'))) |>
  dplyr::mutate(diff = abs(birth_date_numeric - pooled_value)) |>
  dplyr::group_by(hash_key) |>
  dplyr::slice_min(diff, n = 1, with_ties = FALSE) |>  # closest date wins
  dplyr::select(hash_key, chosen_birth_date = birth_date_missranger)

# Step 3: assign the chosen consensus date to all rows of each group,
# keeping only the rows that actually needed imputation.
final_df_birthdate <- df_birth_date_imp |>
  dplyr::left_join(closest_dates_birthdate, by = "hash_key") |>
  dplyr::mutate(
    birth_date_missranger_imp = coalesce(chosen_birth_date, birth_date_missranger)
  ) |>
  dplyr::select(-chosen_birth_date) |>
  dplyr::filter(miss_birth_dates == 1)

rm("df_birth_date_imp")

Variables to impute:        birth_date_for_imp
Variables used to impute:   sexo, pais_nacimiento, se_trata_de_una_mujer_embarazada, sustancia_principal, edad_inicio_sustancia_principal, tiene_menores_de_edad_a_cargo, numero_de_hijos, estado_conyugal, TABLE_rec

    brt___
iter 1: Growing trees.. Progress: 17%. Estimated remaining time: 2 minutes, 31 seconds.
Growing trees.. Progress: 35%. Estimated remaining time: 1 minute, 56 seconds.
Growing trees.. Progress: 53%. Estimated remaining time: 1 minute, 22 seconds.
Growing trees.. Progress: 72%. Estimated remaining time: 48 seconds.
Growing trees.. Progress: 90%. Estimated remaining time: 16 seconds.
0.4800  
iter 2: Growing trees.. Progress: 19%. Estimated remaining time: 2 minutes, 13 seconds.
Growing trees.. Progress: 38%. Estimated remaining time: 1 minute, 39 seconds.
Growing trees.. Progress: 58%. Estimated remaining time: 1 minute, 8 seconds.
Growing trees.. Progress: 77%. Estimated remaining time: 37 seconds.
Growing trees.. Progress: 96%. Estimated remaining time: 5 seconds.
0.4799  
iter 3: Growing trees.. Progress: 19%. Estimated remaining time: 2 minutes, 13 seconds.
Growing trees.. Progress: 38%. Estimated remaining time: 1 minute, 42 seconds.
Growing trees.. Progress: 57%. Estimated remaining time: 1 minute, 10 seconds.
Growing trees.. Progress: 76%. Estimated remaining time: 38 seconds.
Growing trees.. Progress: 95%. Estimated remaining time: 8 seconds.
0.4800  
[1] "Best iter:2"
[1] "Mtry: how many covariates are considered in each tree split: 3"
[1] "The model explains 52.0% of the variance. This is calculated using the out of bag samples in each tree split"
Time difference of 3.10355 secs

We added the corrected birth date and admission age.

Code
# tidytable::as_tidytable(SISTRAT23_c1_2010_2024_df_prev00)|> 
#     tidytable::filter(grepl("c46caa3cd2c89a2222ce319cf6f5e98392f928e0544ee5487",hash_key)) |> glimpse()
#(proc_inconsistent_hashs_h_to_m_no_ext_long$hash_key[grepl("2.1.2.c1",proc_inconsistent_hashs_h_to_m_no_ext_long$obs)])
# Consolidate corrected birth dates into the main dataframe:
#  (1) dates selected from external sources (proc_birthdates_ext_data),
#  (2) dates resolved without external data (..._no_ext_long),
#  (3) dates imputed by missRanger for the "2.1.2.c1" ties,
# then recompute the admission age and append provenance labels to OBS.
SISTRAT23_c1_2010_2024_df_prev1d<-
SISTRAT23_c1_2010_2024_df_prev1c|>
  tidylog::left_join(proc_birthdates_ext_data[,c("hash_key","sel_birth_date","obs")], by="hash_key", multiple="first")|> 
  tidylog::left_join(proc_inconsistent_hashs_h_to_m_no_ext_long[,c("hash_key","sel_birth_date","obs")], by="hash_key", multiple="first")|>
  tidylog::left_join(final_df_birthdate[,c("rn","birth_date_missranger_imp")], by="rn", multiple="first")|>
  # Priority order: external-source date (.x), then no-external resolution
  # (.y), then the missRanger-imputed consensus date.
  tidytable::mutate(birth_date= tidytable::case_when(!is.na(obs.x)~ as.Date(sel_birth_date.x), T~birth_date))|> 
  tidytable::mutate(birth_date= tidytable::case_when(!is.na(obs.y)~ as.Date(sel_birth_date.y), T~birth_date))|> 
  tidytable::mutate(birth_date= tidytable::case_when(!is.na(birth_date_missranger_imp)~ birth_date_missranger_imp, T~birth_date))|> 
  tidytable::mutate(birth_date= as.Date(birth_date, origin = '1970-01-01'))|>
  #tidytable::mutate(adm_age= round(as.numeric((adm_date_rec-birth_date))/365.25,2))|>
  # Admission age in fractional years between birth and recorded admission.
  tidytable::mutate(adm_age= lubridate::time_length(lubridate::interval(birth_date, adm_date_rec), unit = "year"))|>
  # Append an audit trail of which correction applied to each row.
  tidytable::mutate(OBS = tidytable::case_when( !is.na(obs.x)~ glue("{OBS};{obs.x}"),T~OBS))|>
  tidytable::mutate(OBS = tidytable::case_when( !is.na(obs.y)~ glue("{OBS};{obs.y}"),T~OBS))|>
  tidytable::mutate(OBS = tidytable::case_when( hash_key %in% (proc_inconsistent_hashs_h_to_m_no_ext_long$hash_key[grepl("2.1.2.c1",proc_inconsistent_hashs_h_to_m_no_ext_long$obs)])~ glue("{OBS};2.1.2.c1-corr. Replaced birthdate with the closest value to the imputed value"),T~OBS))|>
  # Trim the leading separator left when OBS started empty.
  tidytable::mutate(OBS= gsub("^;", "", OBS))|> 
  # Drop the temporary join columns (obs.x/.y, sel_birth_date.x/.y).
  tidytable::select(-any_of(as.vector(outer(c("obs","sel_birth_date"), c(".x",".y"), FUN = paste, sep = ""))))|> 
  tidytable::select(-any_of("birth_date_missranger_imp"))|> 
  tidytable::as_tidytable()

left_join: added 2 columns (sel_birth_date, obs) > rows only in SISTRAT23_c1_2010_2024_.. 163,070 > rows only in proc_birthdates_ext_dat.. ( 0) > matched rows 11,055 > ========= > rows total 174,125 left_join: added 4 columns (sel_birth_date.x, obs.x, sel_birth_date.y, obs.y) > rows only in tidylog::left_join(SIST.. 173,028 > rows only in proc_inconsistent_hashs.. ( 0) > matched rows 1,097 > ========= > rows total 174,125 left_join: added one column (birth_date_missranger_imp) > rows only in tidylog::left_join(tidy.. 174,099 > rows only in final_df_birthdate[, c(.. ( 0) > matched rows 26 > ========= > rows total 174,125

Code
# Count entries and distinct persons (hash_key) whose admission age is
# implausible (> 90 or < 16) or missing. Filter once and reuse the
# result instead of filtering the full dataframe twice.
invalid_adm_age_rows <- tidytable::filter(
  SISTRAT23_c1_2010_2024_df_prev1d,
  adm_age > 90 | adm_age < 16 | is.na(adm_age)
)
message(paste0(
  "Number of entries w/ infrequent (>90|<16) or missing admission ages= ",
  nrow(invalid_adm_age_rows),
  "\n(HASHs= ",
  invalid_adm_age_rows |> distinct(hash_key) |> nrow(),
  ")"
))

Number of entries w/ infrequent (>90|<16) or missing admission ages= 260 (HASHs= 260)

Code
# Number of entries w/ infrequent (>90|<16) or missing admission ages= 287
# (HASHs= 273)
# AUG 2025
# Number of entries w/ infrequent (>90|<16) or missing admission ages= 286
# (HASHs= 272)
# Number of entries w/ infrequent (>90|<16) or missing admission ages= 260
# (HASHs= 260)

However, infrequent admission ages (>90 or <16) persisted (n= 260; HASHs= 260), prompting us to explore other databases.

2.2. Invalid admission ages

Code
invisible("Select HASHs")
# Distinct persons (hash_key) with an implausible (>90 | <16) or
# missing admission age.
hashs_invalid_adm_age <-
  SISTRAT23_c1_2010_2024_df_prev1d |>
  tidytable::filter(adm_age > 90 | adm_age < 16 | is.na(adm_age)) |>
  distinct(hash_key) |>
  tidytable::pull(hash_key)

# Number of entries w/ infrequent (>90|<16) or missing admission ages= 260 length(hashs_invalid_adm_age)
# inconsistent_hashs_h_to_m_no_ext_long
# inconsistent_hashs_h_to_m

#hashs_invalid_adm_age

#table(grepl("a534782e4a670be9fe584f9b1b47a7f38a73f6b08f18aa7a7b4a45a7ba628dbf",hashs_invalid_adm_age))
#a534782e4a670be9fe584f9b1b47a7f38a73f6b08f18aa7a7b4a45a7ba628dbf is in the vector

invisible("Get external databases for our HASHs with invalid admission ages")

invisible("======================================================")
#6d5f2fc8d4c835e227ac7f99c96f710c235b0415d95571a976b481f9170a4c34
# Hospital discharge database: distinct recorded birth dates per person
# (run) with an invalid admission age, spread into wide columns
# h_fechnac_1, h_fechnac_2, ... The anonymous function only logs
# entry/RUN counts and passes the data through unchanged.
invalid_adm_age_hashs_hosp<-  
HOSP_filter_df|> 
    tidytable::filter(run %in% hashs_invalid_adm_age)|>
    (\(df) {
        (message(paste0("Hospital, Entries: ", nrow(df))))
        (message(paste0("Hospital, RUNs: ", tidytable::distinct(df, run) |> nrow())))
        df
    })()|>
    tidytable::distinct(run, fecha_nac)|>
    tidytable::group_by(run)|>
    # Sequential id per distinct date within each person, used as the
    # wide-column suffix.
    tidytable::mutate(id = as.character(dplyr::row_number()))|>
    tidytable::pivot_wider(names_from = id, values_from = fecha_nac, 
                           names_prefix = "h_fechnac_")

Hospital, Entries: 550

Hospital, RUNs: 149

Code
invisible("Since this measure depended on the admission day and month, we calculated the average birth date.")
# Per person (run): average the distinct recorded birth dates from the
# hospital database and count how many distinct dates were recorded.
invalid_adm_age_hosp_avg <-
  HOSP_filter_df |>
  tidytable::filter(run %in% hashs_invalid_adm_age) |>
  tidytable::distinct(run, fecha_nac) |>
  tidytable::group_by(run) |>
  tidytable::summarise(
    # Mean of the dates as numeric days, converted back to Date.
    # na.rm = TRUE spelled out (T is reassignable).
    h_avg_birth_date = as.Date(mean(as.numeric(fecha_nac), na.rm = TRUE),
                               origin = "1970-01-01"),
    ndis_birth_date = n_distinct(fecha_nac)
  ) |>
  tidytable::ungroup()
# Hospital, Entries: 558
# NULL
# Hospital, RUNs: 152
# NULL
# AUG 2025
# Hospital, Entries: 550
# NULL
# Hospital, RUNs: 149
# NULL
invisible("======================================================")
# TOP database: distinct birth dates per person with an invalid
# admission age, spread into wide columns t_fechnac_1, t_fechnac_2, ...
# The anonymous function only logs counts and passes the data through.
invalid_adm_age_top<-  
SISTRAT23_top_2015_2024_df|>
    tidytable::filter(HASH_KEY %in% hashs_invalid_adm_age)|>
    (\(df) {
        (message(paste0("TOP, Entries: ", nrow(df))))
        (message(paste0("TOP, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    tidytable::distinct(HASH_KEY, birth_date)|>
    
    tidytable::ungroup()|>
    tidytable::group_by(HASH_KEY)|>
    tidytable::mutate(id = as.character(dplyr::row_number()))|>  # Convert `id` to character
    tidytable::pivot_wider(names_from = id, values_from = birth_date, 
                           names_prefix = "t_fechnac_")

TOP, Entries: 169

TOP, RUNs: 55

Code
# TOP, Entries: 170
# NULL
# TOP, RUNs: 56
# NULL
# AUG 2025
# TOP, Entries: 169
# NULL
# TOP, RUNs: 55
# NULL

invisible("======================================================")
# C2 consolidated database: distinct birth dates per person with an
# invalid admission age, spread into wide columns c2_fechnac_1, ...
invalid_adm_age_c2<-  
CONS_C2_25_df |>
    tidytable::filter(HASH_KEY %in% hashs_invalid_adm_age)|>
    (\(df) {
        (message(paste0("C2, Entries: ", nrow(df))))
        (message(paste0("C2, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    tidytable::distinct(HASH_KEY, birth_date)|>
    tidytable::ungroup()|>
    tidytable::group_by(HASH_KEY)|>
    tidytable::mutate(id = as.character(dplyr::row_number()))|>  # Convert `id` to character
    tidytable::pivot_wider(names_from = id, values_from = birth_date, 
                           names_prefix = "c2_fechnac_")

C2, Entries: 16

C2, RUNs: 5

Code
# C2, Entries: 16
# NULL
# C2, RUNs: 5
# NULL
#AUG 2025
# C2, Entries: 16
# NULL
# C2, RUNs: 5
# NULL
invisible("======================================================")
# C3 consolidated database: distinct birth dates per person with an
# invalid admission age, spread into wide columns c3_fechnac_1, ...
invalid_adm_age_c3<-  
CONS_C3_25_df|>
    tidytable::filter(HASH_KEY %in% hashs_invalid_adm_age)|>
    (\(df) {
        (message(paste0("C3, Entries: ", nrow(df))))
        (message(paste0("C3, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    tidytable::distinct(HASH_KEY, birth_date)|>
    tidytable::ungroup()|>
    tidytable::group_by(HASH_KEY)|>
    tidytable::mutate(id = as.character(dplyr::row_number()))|>  # Convert `id` to character
    tidytable::pivot_wider(names_from = id, values_from = birth_date, 
                           names_prefix = "c3_fechnac_")

C3, Entries: 6

C3, RUNs: 4

Code
# C3, Entries: 4
# NULL
# C3, RUNs: 4
# NULL
# AUG 2025
# C3, Entries: 6
# NULL
# C3, RUNs: 4
# NULL

invisible("======================================================")
# C4 consolidated database: distinct birth dates per person with an
# invalid admission age, spread into wide columns c4_fechnac_1, ...
invalid_adm_age_c4<-  
CONS_C4_25_df |>
    tidytable::filter(HASH_KEY %in% hashs_invalid_adm_age)|>
    (\(df) {
        (message(paste0("C4, Entries: ", nrow(df))))
        (message(paste0("C4, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    tidytable::distinct(HASH_KEY, birth_date) |>
    tidytable::ungroup()|>
    tidytable::group_by(HASH_KEY)|>
    tidytable::mutate(id = as.character(dplyr::row_number()))|>  # Convert `id` to character
    tidytable::pivot_wider(names_from = id, values_from = birth_date, 
                           names_prefix = "c4_fechnac_")

C4, Entries: 0

C4, RUNs: 0

Code
#AUG 2025
# C4, Entries: 0
# NULL
# C4, RUNs: 0
# NULL
invisible("======================================================")
# C5 consolidated database: distinct birth dates per person with an
# invalid admission age, spread into wide columns c5_fechnac_1, ...
invalid_adm_age_c5<-  
CONS_C5_25_df |>
    tidytable::filter(HASH_KEY %in% hashs_invalid_adm_age)|>
    (\(df) {
        (message(paste0("C5, Entries: ", nrow(df))))
        (message(paste0("C5, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
        df
    })() |>  
    tidytable::distinct(HASH_KEY, birth_date)|>
    tidytable::ungroup()|>
    tidytable::group_by(HASH_KEY)|>
    tidytable::mutate(id = as.character(dplyr::row_number()))|>  # Convert `id` to character
    tidytable::pivot_wider(names_from = id, values_from = birth_date, 
                           names_prefix = "c5_fechnac_")

C5, Entries: 0

C5, RUNs: 0

Code
#AUG 2025
# C5, Entries: 0
# NULL
# C5, RUNs: 0
# NULL
invisible("======================================================")
# C6 consolidated database: distinct birth dates per person with an
# invalid admission age, spread into wide columns c6_fechnac_1, ...
invalid_adm_age_c6<-  
CONS_C6_25_df |>
    tidytable::filter(HASH_KEY %in% hashs_invalid_adm_age)|>
    (\(df) {
        (message(paste0("C6, Entries: ", nrow(df))))
        (message(paste0("C6, RUNs: ", tidytable::distinct(df, HASH_KEY)|> nrow())))
        df
    })()|>  
    tidytable::distinct(HASH_KEY, birth_date)|>
    tidytable::ungroup()|>
    tidytable::group_by(HASH_KEY)|>
    tidytable::mutate(id = as.character(dplyr::row_number()))|>  # Convert `id` to character
    tidytable::pivot_wider(names_from = id, values_from = birth_date, 
                           names_prefix = "c6_fechnac_")

C6, Entries: 0

C6, RUNs: 0

Code
#AUG 2025
# C6, Entries: 0
# NULL
# C6, RUNs: 0
# NULL
invisible("======================================================")
# Mortality register: distinct birth dates per person (hashkey) with an
# invalid admission age; kept in long form (one column, m_birthdate),
# since the register holds a single date per person.
invalid_adm_age_mortality<-  
mortality |>
    tidytable::filter(hashkey %in% hashs_invalid_adm_age)|>
    (\(df) {
        (message(paste0("Mortality, Entries: ", nrow(df))))
        (message(paste0("Mortality, RUNs: ", tidytable::distinct(df, hashkey)|> nrow())))
        df
    })()|>  
    tidytable::distinct(hashkey, birth_date)|>
    tidytable::ungroup()|> 
    tidytable::rename("m_birthdate"="birth_date")

Mortality, Entries: 19

Mortality, RUNs: 19

Code
# Mortality, Entries: 19
# NULL
# Mortality, RUNs: 19
# NULL
#AUG 2025
# Mortality, Entries: 19
# NULL
# Mortality, RUNs: 19
# NULL

To have more alternative dates, we utilized the previous database (SISTRAT23_c1_2010_2024_df_prev1c) containing inconsistent birth dates. This allowed us to check if any replaced dates were valid, providing an alternative that could be used in this instance.

Code
# Join every alternative birth-date source (hospital average, TOP,
# C2-C6, mortality) onto the SISTRAT rows with invalid admission ages,
# keeping only rows with at least one external candidate date
# (non_NA_count > 0).
invalid_adm_ages_previous_values<-
SISTRAT23_c1_2010_2024_df_prev1c|>
  tidytable::filter(hash_key %in% hashs_invalid_adm_age)|>
  tidylog::left_join(invalid_adm_age_hosp_avg, by=c("hash_key"="run"))|>
  tidytable::select(hash_key, birth_date, h_avg_birth_date)|>
  tidylog::left_join(invalid_adm_age_top, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_c2, by=c("hash_key"="HASH_KEY"))|> 
  tidylog::left_join(invalid_adm_age_c3, by=c("hash_key"="HASH_KEY"))|> 
  tidylog::left_join(invalid_adm_age_c4, by=c("hash_key"="HASH_KEY"))|> 
  tidylog::left_join(invalid_adm_age_c5, by=c("hash_key"="HASH_KEY"))|>
  tidylog::left_join(invalid_adm_age_c6, by=c("hash_key"="HASH_KEY"))|> 
  tidylog::left_join(invalid_adm_age_mortality, by=c("hash_key"="hashkey"))|>
  # Per row, count how many external birth-date columns are filled
  # (every column from h_avg_birth_date to the last one).
  tidytable::mutate_rowwise(
    non_NA_count = rowSums(!is.na(across(h_avg_birth_date:last_col())))
  )|> 
  dplyr::filter(non_NA_count>0)|> 
      (\(df) {
        (message(paste0("Invalid birth date that have at least one external birth date, Entries: ", nrow(df))))
        (message(paste0("Invalid birth date that have at least one external birth date, RUNs: ", tidytable::distinct(df, hash_key)|> nrow())))
        df
    })() 

left_join: added 2 columns (h_avg_birth_date, ndis_birth_date) > rows only in tidytable::filter(SISTR.. 114 > rows only in invalid_adm_age_hosp_avg ( 0) > matched rows 152 > ===== > rows total 266 left_join: added 2 columns (t_fechnac_1, t_fechnac_2) > rows only in tidytable::select(tidyl.. 209 > rows only in invalid_adm_age_top ( 0) > matched rows 57 > ===== > rows total 266 left_join: added one column (c2_fechnac_1) > rows only in tidylog::left_join(tidy.. 260 > rows only in invalid_adm_age_c2 ( 0) > matched rows 6 > ===== > rows total 266 left_join: added one column (c3_fechnac_1) > rows only in tidylog::left_join(tidy.. 262 > rows only in invalid_adm_age_c3 ( 0) > matched rows 4 > ===== > rows total 266 left_join: added no columns > rows only in tidylog::left_join(tidy.. 266 > rows only in invalid_adm_age_c4 ( 0) > matched rows 0 > ===== > rows total 266 left_join: added no columns > rows only in tidylog::left_join(tidy.. 266 > rows only in invalid_adm_age_c5 ( 0) > matched rows 0 > ===== > rows total 266 left_join: added no columns > rows only in tidylog::left_join(tidy.. 266 > rows only in invalid_adm_age_c6 ( 0) > matched rows 0 > ===== > rows total 266 left_join: added one column (m_birthdate) > rows only in tidylog::left_join(tidy.. 247 > rows only in invalid_adm_age_mortality ( 0) > matched rows 19 > ===== > rows total 266 Invalid birth date that have at least one external birth date, Entries: 178

Invalid birth date that have at least one external birth date, RUNs: 174

Code
#Invalid birth date that have at least one external birth date, Entries: 179
#Invalid birth date that have at least one external birth date, RUNs: 177
#AUG 2025
# Invalid birth date that have at least one external birth date, Entries: 178
# Invalid birth date that have at least one external birth date, RUNs: 174

# Complement of invalid_adm_ages_previous_values: the same join chain,
# but keeping the rows with NO external candidate birth date
# (non_NA_count == 0). These cannot be corrected from other databases.
invalid_adm_ages_no_ext_data<-
SISTRAT23_c1_2010_2024_df_prev1c|>
    tidytable::filter(hash_key %in% hashs_invalid_adm_age)|>
    tidylog::left_join(invalid_adm_age_hosp_avg, by=c("hash_key"="run"))|>
    tidytable::select(hash_key, birth_date, h_avg_birth_date)|>
    tidylog::left_join(invalid_adm_age_top, by=c("hash_key"="HASH_KEY"))|>
    tidylog::left_join(invalid_adm_age_c2, by=c("hash_key"="HASH_KEY"))|> 
    tidylog::left_join(invalid_adm_age_c3, by=c("hash_key"="HASH_KEY"))|> 
    tidylog::left_join(invalid_adm_age_c4, by=c("hash_key"="HASH_KEY"))|> 
    tidylog::left_join(invalid_adm_age_c5, by=c("hash_key"="HASH_KEY"))|>
    tidylog::left_join(invalid_adm_age_c6, by=c("hash_key"="HASH_KEY"))|> 
    tidylog::left_join(invalid_adm_age_mortality, by=c("hash_key"="hashkey"))|>
    # Per row, count how many external birth-date columns are filled.
    tidytable::mutate_rowwise(
      non_NA_count = rowSums(!is.na(across(h_avg_birth_date:last_col())))
    )|> 
    dplyr::filter(non_NA_count==0)|> 
    (\(df) {
        (message(paste0("Invalid birth date without external birth dates, Entries: ", nrow(df))))
        (message(paste0("Invalid birth date without external birth dates, RUNs: ", tidytable::distinct(df, hash_key)|> nrow())))
        df
    })() 

left_join: added 2 columns (h_avg_birth_date, ndis_birth_date) > rows only in tidytable::filter(SISTR.. 114 > rows only in invalid_adm_age_hosp_avg ( 0) > matched rows 152 > ===== > rows total 266 left_join: added 2 columns (t_fechnac_1, t_fechnac_2) > rows only in tidytable::select(tidyl.. 209 > rows only in invalid_adm_age_top ( 0) > matched rows 57 > ===== > rows total 266 left_join: added one column (c2_fechnac_1) > rows only in tidylog::left_join(tidy.. 260 > rows only in invalid_adm_age_c2 ( 0) > matched rows 6 > ===== > rows total 266 left_join: added one column (c3_fechnac_1) > rows only in tidylog::left_join(tidy.. 262 > rows only in invalid_adm_age_c3 ( 0) > matched rows 4 > ===== > rows total 266 left_join: added no columns > rows only in tidylog::left_join(tidy.. 266 > rows only in invalid_adm_age_c4 ( 0) > matched rows 0 > ===== > rows total 266 left_join: added no columns > rows only in tidylog::left_join(tidy.. 266 > rows only in invalid_adm_age_c5 ( 0) > matched rows 0 > ===== > rows total 266 left_join: added no columns > rows only in tidylog::left_join(tidy.. 266 > rows only in invalid_adm_age_c6 ( 0) > matched rows 0 > ===== > rows total 266 left_join: added one column (m_birthdate) > rows only in tidylog::left_join(tidy.. 247 > rows only in invalid_adm_age_mortality ( 0) > matched rows 19 > ===== > rows total 266 Invalid birth date without external birth dates, Entries: 88

Invalid birth date without external birth dates, RUNs: 86

Code
#Invalid birth date without external birth dates, Entries: 144 #148 #110
#Invalid birth date without external birth dates, RUNs: 113 #115 #96
#AUG 2025
# Invalid birth date without external birth dates, Entries: 88
# Invalid birth date without external birth dates, RUNs: 86

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#:#:
invisible("long format")

# Helper: melt a wide per-source table once and return the standard
# (hash_key, source_birth, birth_date) columns. The original code called
# reshape2::melt() three times per source (once per extracted column)
# and jammed two bind_rows steps onto one physical line.
melt_birth_source <- function(wide_df, id_col) {
  molten <- reshape2::melt(wide_df, id = id_col)
  cbind.data.frame(
    hash_key = molten[[id_col]],
    source_birth = as.character(molten$variable),
    birth_date = molten$value
  )
}

# Stack all candidate birth dates (original SISTRAT entries, hospital
# average, TOP, C2-C6, mortality) into one long table per person.
invalid_adm_ages_long <-
invalid_adm_ages_previous_values |>
  tidytable::group_by(hash_key) |>
  tidytable::mutate(source_birth = dplyr::row_number()) |>
  tidytable::ungroup() |>
  tidytable::select(hash_key, source_birth, everything()) |>
  # Label each person's own SISTRAT entries orig_1, orig_2, ...
  tidytable::mutate(source_birth = paste0("orig_", source_birth)) |>
  tidytable::select(hash_key, source_birth, birth_date) |>
  # Hospital average birth dates.
  tidytable::bind_rows(cbind.data.frame(
    hash_key = invalid_adm_age_hosp_avg$run,
    source_birth = rep("hosp_avg", times = nrow(invalid_adm_age_hosp_avg)),
    birth_date = invalid_adm_age_hosp_avg$h_avg_birth_date
  )) |>
  # Wide per-source tables, melted back to long form.
  tidytable::bind_rows(melt_birth_source(invalid_adm_age_top, "HASH_KEY")) |>
  tidytable::bind_rows(melt_birth_source(invalid_adm_age_c2, "HASH_KEY")) |>
  tidytable::bind_rows(melt_birth_source(invalid_adm_age_c3, "HASH_KEY")) |>
  tidytable::bind_rows(melt_birth_source(invalid_adm_age_c4, "HASH_KEY")) |>
  tidytable::bind_rows(melt_birth_source(invalid_adm_age_c5, "HASH_KEY")) |>
  tidytable::bind_rows(melt_birth_source(invalid_adm_age_c6, "HASH_KEY")) |>
  tidytable::bind_rows(melt_birth_source(invalid_adm_age_mortality, "hashkey")) |>
  tidytable::arrange(hash_key, source_birth) |>
  tidytable::filter(!is.na(birth_date)) |>
    (\(df) {
        (message(paste0("Long database of HASHs with invalid ages, Entries: ", nrow(df))))
        (message(paste0("Long database of HASHs with invalid ages, RUNs: ", tidytable::distinct(df, hash_key)|> nrow())))
        df
    })() 

Long database of HASHs with invalid ages, Entries: 410 Long database of HASHs with invalid ages, RUNs: 174

Code
# Long database of HASHs with invalid ages, Entries: 415
# Long database of HASHs with invalid ages, RUNs: 177
#AUG 2025
# Long database of HASHs with invalid ages, Entries: 410
# Long database of HASHs with invalid ages, RUNs: 174
# invalid_adm_ages_long$adm_date_rec<-NULL

To evaluate valid birth dates, we considered the substance use onset ages, the age at first admission, the number of times the birth date was recorded as identical to the admission date, and a flag column called rec_min_adm_age, set to 1 when the admission age was greater than 90 or lower than 3 years.

Code
invisible("======================================================")
# Deterministically matched cases: join the C1 hash crosswalk with the PO
# Office records (Base_fiscalia_v2) to retrieve an external average birth
# date (avg_birth_date_po) for the hashes flagged with invalid admission ages.
invalid_adm_age_may23_PO_office<-  
OLD_NEW_SISTRAT23_c1_2010_2024_df2|>
  tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY.y"="rut_enc_saf"))|> 
  tidytable::select("HASH_KEY.x","HASH_KEY.y", "sexo.y","avg_birth_date_po")|> 
  tidytable::filter(HASH_KEY.x %in% hashs_invalid_adm_age)|>
    (\(df) {
        # Log entry and distinct-hash counts before deduplicating
        message(paste0("PO Office, Entries: ", nrow(df)))
        message(paste0("PO Office, RUNs: ", tidytable::distinct(df, HASH_KEY.x) |> nrow()))
        df
    })()|>  
    tidytable::distinct(HASH_KEY.x, avg_birth_date_po)|>
    tidytable::ungroup() 

right_join: added 5 columns (sexo.x, fec_nacimiento_simple, sexo.y, avg_birth_date_po, n_dis_birth_date_po) > rows only in OLD_NEW_SISTRAT23_c1_20.. ( 56,867) > rows only in Base_fiscalia_v2 30,256 > matched rows 1,164,249 (includes duplicates) > =========== > rows total 1,194,505 PO Office, Entries: 2322

PO Office, RUNs: 204

Code
#PO Office, Entries: 2488
#PO Office, RUNs: 212
#AUG 2025
# PO Office, Entries: 2322
# PO Office, RUNs: 204


# Alternative (non-deterministic, Aug 2025) matches: same PO Office join for
# hashes NOT covered by the deterministic crosswalk above.
invalid_adm_age_may23_PO_office_alt<-  
OLD_NEW_SISTRAT23_c1_2010_2024_df2_alt|>
  # Discard overlapping HASHes already matched deterministically
  tidytable::filter(!HASH_KEY %in% OLD_NEW_SISTRAT23_c1_2010_2024_df2$HASH_KEY.x)|> 
  # Join with PO Office (keeping the first match per key)
  tidylog::right_join(Base_fiscalia_v2, by=c("HASH_KEY_target"="rut_enc_saf"), multiple="first")|> 
  # Select variables of interest
  tidytable::select("HASH_KEY","HASH_KEY_target", "sexo.y","avg_birth_date_po")|> 
  # Keep inconsistent birth dates only
  tidytable::filter(HASH_KEY %in% hashs_invalid_adm_age)|>
    (\(df) {
        (message(paste0("PO Office (alt., Aug 2025, not deterministically matched), Entries: ", nrow(df))))
        (message(paste0("PO Office (alt., Aug 2025, not deterministically matched), RUNs: ", tidytable::distinct(df, HASH_KEY) |> nrow())))
        df
    })()|>
  tidytable::distinct(HASH_KEY, avg_birth_date_po)

right_join: added 5 columns (sexo.x, fec_nacimiento_simple, sexo.y, avg_birth_date_po, n_dis_birth_date_po) > rows only in tidytable::filter(OLD_N.. ( 1,858) > rows only in Base_fiscalia_v2 524,344 > matched rows 30,694 (includes duplicates) > ========= > rows total 555,038 PO Office (alt., Aug 2025, not deterministically matched), Entries: 18

PO Office (alt., Aug 2025, not deterministically matched), RUNs: 18

Code
invisible("mixed alternative with main")
# Combine the deterministic PO Office matches (key column renamed to the
# common name HASH_KEY) with the alternative, non-deterministically matched
# records into a single lookup table.
invalid_adm_age_PO_office <-
  invalid_adm_age_may23_PO_office |>
  dplyr::rename(HASH_KEY = HASH_KEY.x) |>
  bind_rows(invalid_adm_age_may23_PO_office_alt)


#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("======================================================")
invisible("======================================================")


# Hashes among the invalid-age cases whose C1 birth date is missing entirely.
hash_key_miss_dates<- 
tidytable::filter(SISTRAT23_c1_2010_2024_df_prev1d, hash_key %in% hashs_invalid_adm_age) |> filter(is.na(birth_date)) |> pull(hash_key)
message(paste0("Missing birth dates: ",length(hash_key_miss_dates),"\n"))

Missing birth dates: 2

Code
#Missing birth dates: 2

# SISTRAT23_c1_2010_2024_df_prev1 |> filter(hash_key %in% hash_key_miss_dates) |> select(hash_key, birth_date, adm_date, edad) |> print(n=100)

# Per-hash summary used by the selection rules below: mean substance-onset
# ages, minimum admission age, and how often the recorded admission date
# equals the birth date.
# NOTE: min() over an all-NA adm_age group returns Inf with a warning
# (two such groups are reported in the output below).
summary_invalid_adm_ages_chars<-
tidytable::filter(SISTRAT23_c1_2010_2024_df_prev1d, hash_key %in% hashs_invalid_adm_age)|>
  tidytable::summarise(avg_prim_subs_adm= mean(edad_inicio_sustancia_principal, na.rm=T), avg_subs_onset= mean(edad_inicio_consumo, na.rm=T), min_adm_age= min(adm_age, na.rm=T),sum_eq_bd_adm= sum(adm_date_rec==birth_date),.by="hash_key",.groups="drop")|> 
  # 1 = implausible minimum admission age (over 90 or under 3 years old)
  tidytable::mutate(rec_min_adm_age= ifelse(min_adm_age>90|min_adm_age<3,1,0)) 

Warning in min(adm_age, na.rm = T): ningún argumento finito para min; retornando Inf

Warning in min(adm_age, na.rm = T): ningún argumento finito para min; retornando Inf

Code
#two warnings: Warning in min(adm_age, na.rm = T) :
#Possibly due to missingness in birth date.

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

# Base output path.
# NOTE(review): gsub removes EVERY "cons" substring in the path, not just a
# trailing component -- a working directory containing "cons" elsewhere
# would be mangled; confirm this is intended.
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
# Drive-letter check: Google Drive mount on "G:", otherwise fall back to "E:".
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
invisible("manual review of criteria to see invalid adm ages")

# Long table of invalid-age HASHes restricted to the MAIN sources
# (source_birth starting with "orig_" or "t_"), joined with the PO Office
# lookup and the per-hash summary, keeping distinct birth dates per hash_key.
invalid_adm_ages_long_w_po_and_c1_info_orig_t <-
invalid_adm_ages_long|>
  tidytable::left_join(invalid_adm_age_PO_office, c("hash_key"="HASH_KEY"))|> 
  # NOTE(review): left_join takes `by=`, not `.by=`; as written the argument
  # is likely absorbed by `...` and the join falls back to all common
  # columns -- verify the intended key is actually used.
  tidytable::left_join(summary_invalid_adm_ages_chars, .by="hash_key")|> 
  tidytable::mutate(main= ifelse(grepl("^orig_|^t_", source_birth),1,0))|> 
  tidytable::group_by(hash_key,main)|> 
  # Number of distinct birth dates within each hash_key x main combination
  tidytable::mutate(ndis_birth_date= n_distinct(birth_date))|> 
  tidytable::ungroup()|> 
  tidytable::filter(main==1)|> 
  #2024-11-17: it was selecting only distinct birth dates, now it will only select distinct birthdates within each hash_key
  tidytable::group_by(hash_key)|> 
  tidytable::distinct(birth_date, .keep_all =T)|> 
  tidytable::ungroup()

# Counterpart table: NON-main sources (main == 0), kept long to contrast
# against the main C1/TOP birth dates.
invalid_adm_ages_long_w_po_and_c1_info_other <-
invalid_adm_ages_long|>
  tidytable::left_join(invalid_adm_age_PO_office, c("hash_key"="HASH_KEY"))|> 
  # NOTE(review): same `.by=` vs `by=` concern as the "_orig_t" pipeline.
  tidytable::left_join(summary_invalid_adm_ages_chars, .by="hash_key")|>
  tidytable::mutate(main= ifelse(grepl("^orig_|^t_", source_birth),1,0))|> 
  tidytable::group_by(hash_key,main)|> 
  tidytable::mutate(ndis_birth_date= n_distinct(birth_date))|> 
  tidytable::ungroup()|> 
  tidytable::filter(main==0)

invisible("If there are different values among TOP and C1, I kept every distinct ones to contrast w/ other sources")
# Reassemble main + other rows, ordered for manual review.
invalid_adm_ages_long_w_po_and_c1_info<-
tidytable::bind_rows(invalid_adm_ages_long_w_po_and_c1_info_orig_t,
                     invalid_adm_ages_long_w_po_and_c1_info_other)|> 
  tidytable::arrange(hash_key, main, birth_date)

wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath
# Export the long review table to CSV for manual inspection.
invalid_adm_ages_long_w_po_and_c1_info |> rio::export(file=paste0(wdpath,"cons/_out/invalid_adm_ages_long_25.csv"))

#031b27e9f5191197cd7db1aa85232937e424c47bc24c0922e = adm_date= birth_date

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#
invisible("W/o ext data, long format. Check if any of the alternative birth dates is closer to mean")
# Reshape the no-external-data cases to long format: one row per recorded
# birth date, with a synthetic source label "orig_<row number within hash>".
invalid_adm_ages_no_ext_data_long<-
invalid_adm_ages_no_ext_data|>
  (\(df) {
        message(paste0("Invalid or no admission ages, No external data, Entries: ", nrow(df)))
        message(paste0("Invalid or no admission ages, No external data, RUNs: ", tidytable::distinct(df, hash_key)|> nrow()))
        df
    })() |> 
  tidytable::group_by(hash_key)|> 
  tidytable::mutate(source_birth= dplyr::row_number())|> 
  tidytable::ungroup()|> 
  tidytable::select(hash_key, source_birth, everything())|> 
  tidytable::mutate(source_birth= paste0("orig_",source_birth))|> 
  tidytable::select(hash_key, source_birth, birth_date) 

Invalid or no admission ages, No external data, Entries: 88 Invalid or no admission ages, No external data, RUNs: 86

Code
#110
#96
#AUG 2025
#Invalid or no admission ages, No external data, Entries: 88
#Invalid or no admission ages, No external data, RUNs: 86

invisible("Added with PO office records")
# Join the no-external-data long table with the PO Office dates and per-hash
# summaries; keep only main-source rows with distinct birth dates.
# NOTE(review): unlike the "_orig_t" pipeline, distinct() here is NOT grouped
# by hash_key -- confirm that global deduplication is intended.
invalid_adm_ages_no_ext_data_long_w_po_and_c1_info <-
invalid_adm_ages_no_ext_data_long|>
  tidytable::left_join(invalid_adm_age_PO_office, c("hash_key"="HASH_KEY"))|>
  tidytable::left_join(summary_invalid_adm_ages_chars, .by="hash_key")|>
  tidytable::mutate(main= ifelse(grepl("^orig_|^t_", source_birth),1,0))|> 
  tidytable::group_by(hash_key,main)|> 
  tidytable::mutate(ndis_birth_date= n_distinct(birth_date))|> 
  tidytable::ungroup()|> 
  tidytable::filter(main==1)|> 
  tidytable::distinct(birth_date, .keep_all =T)

wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath

# Export for manual review.
invalid_adm_ages_no_ext_data_long_w_po_and_c1_info|>     
  rio::export(file=paste0(wdpath,"cons/_out/invalid_adm_ages_long_w_ext_info25.csv"))
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"

We need to consider that the oldest 1% were born on 1951-08-10, while the youngest 1% were born on 2002-01-23. Thus, the former could have been at most 70.3945205 years old, while the latter could have been at least 7.939726 years old at admission. Any admission age outside that range will be considered anomalous. However, some extremely implausible birth dates were those on or after Jan. 1, 2010, or on or before Jan. 1, 1910. According to Tukey’s criterion (\(Q_{1} - 1.5\times IQR\) and \(Q_{3} + 1.5\times IQR\)), outliers were identified as dates earlier than 1949-11-18 or later than 2012-07-18.

Code
#https://bergant.github.io/bpmn/
wdpath<-
paste0(gsub("/cons","",gsub("cons","",paste0(getwd(),"/cons"))))
envpath<- if(regmatches(wdpath, regexpr("[A-Za-z]+", wdpath))=="G"){"G:/Mi unidad/Alvacast/SISTRAT 2023/"}else{"E:/Mi unidad/Alvacast/SISTRAT 2023/"}
envpath

# Remove any stale rendered diagram files before rebuilding.
unlink(paste0(wdpath,"cons/_figs/diagram_invalid_adm_ages_files"), recursive = TRUE)

# Build the BPMN widget for the birth-date selection workflow
w <- bpmn::bpmn(
  paste0(wdpath, "cons/_input/diagram_invalid_adm_ages.bpmn"),
  width = "100%", height = "800px"
)

# Attach the svg-pan-zoom dependency from a CDN
dep <- htmltools::htmlDependency(
  name    = "svg-pan-zoom",
  version = "3.6.1",
  src     = c(href = "https://unpkg.com/svg-pan-zoom@3.6.1/dist"),
  script  = "svg-pan-zoom.min.js"
)
w <- htmltools::attachDependencies(w, dep)

# Initialize pan/zoom once the SVG is in the DOM (retries every 50 ms);
# the wheel handler zooms the diagram instead of scrolling the page.
w <- htmlwidgets::onRender(w, "
function(el,x){
  function init(){
    var svg = el.querySelector('svg');
    if(!svg){ setTimeout(init, 50); return; }
    var panZoom = window.svgPanZoom(svg, {
      zoomEnabled: true,
      controlIconsEnabled: true,
      fit: true,
      center: true,
      minZoom: 0.3,
      maxZoom: 20
    });
    el.addEventListener('wheel', function(e){
      e.preventDefault();
      panZoom.zoomBy(e.deltaY > 0 ? 0.9 : 1.1);
    }, { passive: false });
  }
  init();
}
")
w

# Save a standalone HTML copy and render a PNG snapshot of the diagram.
htmlwidgets::saveWidget(bpmn::bpmn(paste0(wdpath, "cons/_input/diagram_invalid_adm_ages.bpmn")), paste0(wdpath,"cons/_figs/diagram_invalid_adm_ages.html"))
webshot::webshot(paste0(wdpath,"cons/_figs/diagram_invalid_adm_ages.html"),paste0(wdpath,"cons/_figs/diagram_invalid_adm_ages.png"), vwidth = 300*1.2, vheight = 300,  zoom=10, expand=100)  # Try different top, left, width, and height coordinates.
Workflow for rule-based selection of valid birth dates

Workflow for rule-based selection of valid birth dates

[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/"

Workflow for rule-based selection of valid birth dates

We started selecting values for those cases with no other external official information provided (invalid_adm_ages_no_ext_data_long_w_po_and_c1_info). We ensured that birth dates were corrected accurately by considering both the proximity to the average birth date and the presence of anomalies based on conditional checks. Records requiring further attention were clearly flagged for potential probabilistic imputation. Cases where only one anomalous birth date exists are flagged, and imputation is recommended. For instances with multiple dates, the one closest to the average was selected. Records containing only anomalous birth dates are flagged, with imputation recommended. Lastly, records with no anomalies either retained the original date or used the date closest to the mean.

Code
#https://chatgpt.com/share/6738f80f-cd6c-8010-a7d1-cc8a5669bfcd

invisible("Define average birth date, and percentiles 1 and 99")
# Global reference values used by the selection rules below: mean birth date
# and the 1st/99th percentiles of the whole C1 cohort.
avg_birth_date<-mean(SISTRAT23_c1_2010_2024_df_prev1d$birth_date,na.rm=T)
q1_birth_date <- as.Date(quantile(unclass(SISTRAT23_c1_2010_2024_df_prev1d$birth_date), 0.01, na.rm=T), origin = '1970-01-01')
q99_birth_date <- as.Date(quantile(unclass(SISTRAT23_c1_2010_2024_df_prev1d$birth_date), 0.99, na.rm=T), origin = '1970-01-01')
avg_birth_date; q1_birth_date; q99_birth_date
invisible("Define anomalous admission ages")
# [1] "1980-01-12"
#           1% 
# "1951-02-17" 
#          99% 
# "2000-05-17"

# Flag groups whose minimum admission age is implausible (> 71 or < 9 years).
# NOTE(review): an Inf min_adm_age (all-NA group, see warnings above) is
# labeled "anomalous"; an NA min_adm_age yields an NA flag -- confirm
# downstream handling of that NA.
invalid_adm_ages_no_ext_data_long_w_po_and_c1_info$min_adm_age_flag <- with(
  invalid_adm_ages_no_ext_data_long_w_po_and_c1_info,
  ifelse(min_adm_age > 71 | min_adm_age < 9, "anomalous", "")
)

cat("IQRs\n")
# Tukey fences of the birth-date distribution (Q1 - 1.5*IQR, Q3 + 1.5*IQR),
# expressed as ages in years at 2022-01-01.
lubridate::time_length(interval(as.Date(quantile(unclass(SISTRAT23_c1_2010_2024_df_prev1d$birth_date),.25, na.rm=T) - 1.5*IQR(unclass(SISTRAT23_c1_2010_2024_df_prev1d$birth_date), na.rm=T)),as.Date("2022-01-01")), unit= "years")
lubridate::time_length(interval(as.Date(quantile(unclass(SISTRAT23_c1_2010_2024_df_prev1d$birth_date),.75, na.rm=T) + 1.5*IQR(unclass(SISTRAT23_c1_2010_2024_df_prev1d$birth_date), na.rm=T)),as.Date("2022-01-01")), unit= "years")

# [1] 72.1137
# [1] 10.59959 # AUG 2025: 9.461644

invisible("=======================================================")

invisible("Function to process cases with no ext. info.")
# Select a valid birth date for ONE hash_key group that has no external
# official information, using the PO Office average birth date (if any) and
# the C1 birth dates themselves.
#
# df_group: data.frame for a single hash_key with columns birth_date,
#   avg_birth_date_po, ndis_birth_date and, ideally, min_adm_age_flag; when
#   that column was dropped upstream it is rebuilt here from min_adm_age.
# Reads globals: avg_birth_date, q1_birth_date, q99_birth_date.
# Returns df_group with sel_birth_date (chosen date, or NA when imputation is
# recommended) and FLAG (rule label 2.2.1.a-2.2.1.g) added.
process_group_noextinfo <- function(df_group) {
  # Initialize 'sel_birth_date' & 'FLAG' columns
  df_group$sel_birth_date <- NA
  df_group$FLAG <- NA

  # Extract scalars: keep only the FIRST unique value so the `if` guards
  # below never receive a length > 1 condition (an error since R 4.2).
  avg_birth_date_po <- unique(df_group$avg_birth_date_po)[1]
  ndis <- unique(df_group$ndis_birth_date)[1]

  # Rebuild the anomaly flag when the column was dropped upstream; this was
  # the source of the "Unknown or uninitialised column: `min_adm_age_flag`"
  # warnings. Same 71/9-year thresholds as the caller.
  min_adm_age_flag <- unique(df_group[["min_adm_age_flag"]])[1]
  if (is.null(min_adm_age_flag)) {
    mn <- df_group$min_adm_age[1]
    min_adm_age_flag <- if (!is.na(mn) && (mn > 71 || mn < 9)) "anomalous" else ""
  }
  has_po <- !is.na(avg_birth_date_po)
  # isTRUE() keeps the guards scalar-safe when the flag itself is NA
  is_anom <- isTRUE(min_adm_age_flag == "anomalous")
  not_anom <- isTRUE(min_adm_age_flag == "")

  # Obtain unique, non-missing birth dates in the group (NAs previously
  # leaked into the range comparisons below)
  birth_dates <- unique(df_group$birth_date)
  birth_dates <- birth_dates[!is.na(birth_dates)]

  # Condition 1: Have a PO birth date & admission age is anomalous
  if (has_po && is_anom) {
    df_group$sel_birth_date <- avg_birth_date_po
    df_group$FLAG <- '2.2.1.a.Anomalous adm age, have a PO birth date, replace birth date'
  } else if (has_po && not_anom) {
    # Condition 2: have a PO birth date but not anomalous -- include the PO
    # date among the candidates and pick the one closest to the global mean
    candidates <- c(birth_dates, avg_birth_date_po)
    candidates <- candidates[!is.na(candidates)]
    diffs <- abs(as.numeric(candidates) - as.numeric(avg_birth_date))
    df_group$sel_birth_date <- candidates[which.min(diffs)]
    df_group$FLAG <- '2.2.1.b. Not anomalous adm age, have a PO birth date, replace birth date with the closest birth date to the average birth date'
  } else if (!has_po && is_anom) {
    # Condition 3: no PO birth date and anomalous admission age
    if (isTRUE(ndis == 1)) {
      df_group$FLAG <- '2.2.1.c.Only 1 anomalous birth date, imputation is recommended'
      df_group$sel_birth_date <- NA
    } else if (isTRUE(ndis > 1)) {
      # Keep only dates within percentiles 1 & 99
      valid_dates <- birth_dates[birth_dates >= q1_birth_date & birth_dates <= q99_birth_date]
      if (length(valid_dates) > 0) {
        # Select the closest date to the average
        diffs <- abs(as.numeric(valid_dates) - as.numeric(avg_birth_date))
        df_group$sel_birth_date <- valid_dates[which.min(diffs)]
        df_group$FLAG <- '2.2.1.d.>1 distinct date, selected birth date w/ values closer to average birth date'
      } else {
        df_group$FLAG <- '2.2.1.e.Only anomalous birth dates, imputation is recommended'
        df_group$sel_birth_date <- NA
      }
    }
  } else if (!has_po && not_anom) {
    # Condition 4: no PO birth date & not anomalous
    if (isTRUE(ndis > 1)) {
      # Select the closest date to the mean
      diffs <- abs(as.numeric(birth_dates) - as.numeric(avg_birth_date))
      df_group$sel_birth_date <- birth_dates[which.min(diffs)]
      df_group$FLAG <- '2.2.1.f.No anomalous birth dates, used the closest to the mean'
    } else if (isTRUE(ndis == 1)) {
      # Only one birth date in the group: keep it
      df_group$sel_birth_date <- birth_dates[1]
      df_group$FLAG <- '2.2.1.g.Only one birth date, no anomalous values, kept the original birth date'
    }
  }

  return(df_group)
}

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Apply the function to every hash_key
# (one data frame per hash_key; results are row-bound back together)
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info <- 
invalid_adm_ages_no_ext_data_long_w_po_and_c1_info|>
  dplyr::group_split(hash_key)|>
  purrr::map_dfr(~ process_group_noextinfo(.x))

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("h and g do not have values")
# Count entries and distinct hash_keys resolved by rule 2.2.1.a
# (anomalous admission age replaced with the PO Office birth date).
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
  dplyr::filter(grepl("2\\.2\\.1\\.a", FLAG)) |>
  (\(df) {
    message("2.2.1.a. Anomalous adm age, have a PO birth date, replace birth date")
    message(paste0("2.2.1.a, Entries: ", nrow(df)))
    message(paste0("2.2.1.a, RUNs: ", nrow(tidytable::distinct(df, hash_key))))
  })()

2.2.1.a. Anomalous adm age, have a PO birth date, replace birth date

2.2.1.a, Entries: 61

2.2.1.a, RUNs: 61

Code
# Count entries and distinct hash_keys resolved by rule 2.2.1.b.
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
  dplyr::filter(grepl("2\\.2\\.1\\.b", FLAG)) |>
  (\(df) {
    message("2.2.1.b. Not anomalous adm age, have a PO birth date, replace with closest to average")
    message(paste0("2.2.1.b, Entries: ", nrow(df)))
    message(paste0("2.2.1.b, RUNs: ", nrow(tidytable::distinct(df, hash_key))))
  })()

2.2.1.b. Not anomalous adm age, have a PO birth date, replace with closest to average

2.2.1.b, Entries: 11

2.2.1.b, RUNs: 11

Code
# Count entries and distinct hash_keys resolved by rule 2.2.1.c.
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
  dplyr::filter(grepl("2\\.2\\.1\\.c", FLAG)) |>
  (\(df) {
    message("2.2.1.c. Only 1 anomalous birth date, imputation is recommended")
    message(paste0("2.2.1.c, Entries: ", nrow(df)))
    message(paste0("2.2.1.c, RUNs: ", nrow(tidytable::distinct(df, hash_key))))
  })()

2.2.1.c. Only 1 anomalous birth date, imputation is recommended

2.2.1.c, Entries: 10

2.2.1.c, RUNs: 10

Code
# Count entries and distinct hash_keys resolved by rule 2.2.1.d.
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
  dplyr::filter(grepl("2\\.2\\.1\\.d", FLAG)) |>
  (\(df) {
    message("2.2.1.d. >1 distinct date, selected birth date closer to average")
    message(paste0("2.2.1.d, Entries: ", nrow(df)))
    message(paste0("2.2.1.d, RUNs: ", nrow(tidytable::distinct(df, hash_key))))
  })()

2.2.1.d. >1 distinct date, selected birth date closer to average

2.2.1.d, Entries: 0

2.2.1.d, RUNs: 0

Code
# Count entries and distinct hash_keys resolved by rule 2.2.1.e.
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
  dplyr::filter(grepl("2\\.2\\.1\\.e", FLAG)) |>
  (\(df) {
    message("2.2.1.e. Only anomalous birth dates, imputation is recommended")
    message(paste0("2.2.1.e, Entries: ", nrow(df)))
    message(paste0("2.2.1.e, RUNs: ", nrow(tidytable::distinct(df, hash_key))))
  })()

2.2.1.e. Only anomalous birth dates, imputation is recommended

2.2.1.e, Entries: 0

2.2.1.e, RUNs: 0

Code
# Count entries and distinct hash_keys resolved by rule 2.2.1.f.
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
  dplyr::filter(grepl("2\\.2\\.1\\.f", FLAG)) |>
  (\(df) {
    message("2.2.1.f. No anomalous birth dates, used the closest to the mean")
    message(paste0("2.2.1.f, Entries: ", nrow(df)))
    message(paste0("2.2.1.f, RUNs: ", nrow(tidytable::distinct(df, hash_key))))
  })()

2.2.1.f. No anomalous birth dates, used the closest to the mean

2.2.1.f, Entries: 0

2.2.1.f, RUNs: 0

Code
# Count entries and distinct hash_keys resolved by rule 2.2.1.g.
proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info |>
  dplyr::filter(grepl("2\\.2\\.1\\.g", FLAG)) |>
  (\(df) {
    message("2.2.1.g. Only one birth date, no anomalous values, kept the original")
    message(paste0("2.2.1.g, Entries: ", nrow(df)))
    message(paste0("2.2.1.g, RUNs: ", nrow(tidytable::distinct(df, hash_key))))
  })()

2.2.1.g. Only one birth date, no anomalous values, kept the original

2.2.1.g, Entries: 1

2.2.1.g, RUNs: 1

Code
# 2.2.1.a.Anomalous adm age, have a PO birth date, replace birth date
# 2.2.1.b. Not anomalous adm age, have a PO birth date, replace birth date with the closest birth date to the average birth date
# 2.2.1.c.Only 1 anomalous birth date, imputation is recommended
# 2.2.1.d.>1 distinct date, selected birth date w/ values closer to average birth date
# 2.2.1.e.Only anomalous birth dates, imputation is recommended
# 2.2.1.f.No anomalous birth dates, used the closest to the mean
# 2.2.1.g.Only one birth date, no anomalous values, kept the original birth date
# 2.2.1.a, Entries: 12 #61
# 2.2.1.a, RUNs: 12 $61
# 2.2.1.b, Entries: 2  #11
# 2.2.1.b, RUNs: 2 #11
# 2.2.1.c, Entries: 58 #10
# 2.2.1.c, RUNs: 58 #10
# 2.2.1.d, Entries: 64 #0
# 2.2.1.d, RUNs: 31 #0
# 2.2.1.e, Entries: 0 #0
# 2.2.1.e, RUNs: 0 #0
# 2.2.1.f, Entries: 0 #0
# 2.2.1.f, RUNs: 0 #0
# 2.2.1.g, Entries: 10 #1
# 2.2.1.g, RUNs: 10 #1
#8e3591364bcc3eb5630f2db00e6f15bd57c07604e7c60f6086151f81f38b3727, no lo codificó bien, debiese haber sido d.

# Echo the warning count observed during the group-wise processing above.
cat(paste0("Warning, Unknown or uninitialised column: `min_adm_age_flag`: ",174,"\n"))
# For some groups passed into process_group_noextinfo(), the column min_adm_age_flag doesn't exist (or it exists in the big data frame but is dropped upstream), and you access it with df_group$min_adm_age_flag. In a tibble, using $ on a missing column triggers exactly that warning.
[1] "1980-10-13"
          1% 
"1951-08-10" 
         99% 
"2002-01-23" 
IQRs
[1] 72.12055
[1] 9.457534
Warning, Unknown or uninitialised column: `min_adm_age_flag`: 174

In our dataset, we encountered individuals with multiple birth dates from various sources, including external records such as hospital admission dates (hosp_avg). To improve data integrity and select the most reliable birth date for each individual, we applied a systematic set of rules, each labeled for reference (e.g., 2.2.2.a to 2.2.2.i). These required the removal of implausible birth dates (outside the range 1910-01-01 to 2010-01-01), prioritizing reliable sources (mortality > hospital > PO if within valid dates), analyzing multiple valid birth dates, handling single valid birth dates after removal, and fallback options.

Code
#https://chatgpt.com/c/6737c7a6-4318-8010-98f3-512061b56100

# Recompute the percentile-1/99 bounds used as the valid birth-date range.
q1_birth_date <- as.Date(quantile(unclass(SISTRAT23_c1_2010_2024_df_prev1d$birth_date), 0.01, na.rm=T), origin = '1970-01-01')
q99_birth_date <- as.Date(quantile(unclass(SISTRAT23_c1_2010_2024_df_prev1d$birth_date), 0.99, na.rm=T), origin = '1970-01-01')

invisible("Define anomalous admission ages")
# Same 71/9-year anomaly flag as above, now for the dataset WITH external info.
invalid_adm_ages_long_w_po_and_c1_info$min_adm_age_flag <- with(
  invalid_adm_ages_long_w_po_and_c1_info,
  ifelse(min_adm_age > 71 | min_adm_age < 9, "anomalous", "")
)

invisible("=======================================================")
invisible("")

# Function to process each group
# Select a single valid birth date per hash_key group, combining C1 records
# with mortality ("m_birthdate"), hospital ("hosp_avg") and PO Office
# (avg_birth_date_po) sources, applying rules 2.2.2.a - 2.2.2.i in order.
# Reads globals: avg_birth_date, q1_birth_date, q99_birth_date.
# NOTE(review): setdiff()/intersect() on Date vectors return plain numerics
# (day counts); the as.Date()/as.Date.numeric() calls on those values rely
# on the default origin "1970-01-01" (R >= 4.3) -- confirm the R version.
process_inv_birthdates_w_info <- function(df_group) {
  # Initialize 'sel_birth_date', 'FLAG', and 'obs'
  df_group <- df_group |> 
    mutate(sel_birth_date = as.Date(NA),
           FLAG = NA,
           obs = NA) |> 
  # AGS: blank out extremely implausible dates (on/after 2010-01-01 or
  # on/before 1910-01-01) before any rule is applied
    mutate(
      birth_date = case_when(
        birth_date >= as.Date("2010-01-01") ~ as.Date(NA),
        birth_date <= as.Date("1910-01-01") ~ as.Date(NA),
        TRUE ~ birth_date
      )
    )
  # Extract unique values needed. NAs are dropped so a mortality date wiped
  # as implausible above is not "prioritized" as NA, and only the first PO
  # date is kept so the scalar `if` guards below never receive a length > 1
  # condition (an error since R 4.2). Unused extractions of
  # min_adm_age_flag / ndis_birth_date were removed (the former triggered
  # tibble missing-column warnings).
  m_birthdate <- unique(df_group$birth_date[df_group$source_birth == "m_birthdate"])
  m_birthdate <- m_birthdate[!is.na(m_birthdate)]
  #AUG 2025, added the alternative date just in case
  avg_birth_date_po <- unique(c(df_group$avg_birth_date_po))
  avg_birth_date_po <- avg_birth_date_po[!is.na(avg_birth_date_po)][1]
  # Remove NA values from birth_dates
  birth_dates <- unique(df_group$birth_date)
  birth_dates <- birth_dates[!is.na(birth_dates)]

  # Rule 1: Prioritize 'm_birthdate' if available (obs_label = "2.2.2.a").
  # If more than one distinct mortality date survives, the first is used.
  if (length(m_birthdate) > 0) {
    df_group <- df_group %>%
      mutate(sel_birth_date = m_birthdate[1],
             obs = "2.2.2.a. Prioritized birthdate of mortality database")
    return(df_group)
  }
  # Rule 2: Use 'avg_birth_date_po' if within valid range (obs_label = "2.2.2.b").
  # (sel_birth_date is always NA at this point, so the old check on it was
  # redundant and has been removed.)
  if (!is.na(avg_birth_date_po) &&
      avg_birth_date_po >= q1_birth_date && avg_birth_date_po <= q99_birth_date) {
    df_group <- df_group %>%
      mutate(sel_birth_date = avg_birth_date_po,
             obs = "2.2.2.b. Used PO date if within percentiles 1 and 99")
    return(df_group)
  }
  # Rule 3: Analyze multiple birth dates (obs_label = "2.2.2.c")
  if (length(birth_dates) > 1) {
    # Keep only birth_dates within the valid percentile range
    valid_birth_dates <- birth_dates[birth_dates >= q1_birth_date & birth_dates <= q99_birth_date]

    if (length(valid_birth_dates) > 0) {
      # Check if any of the valid_birth_dates come from the 'hosp_avg' source
      hosp_avg_dates <- df_group$birth_date[df_group$source_birth == "hosp_avg"]
      valid_hosp_avg_dates <- intersect(valid_birth_dates, hosp_avg_dates)
      # Dates from sources other than the hospital average (numeric day
      # counts: setdiff() drops the Date class)
      other_b_dates <- setdiff(valid_birth_dates, hosp_avg_dates)
      # Most frequent value among the non-hospital dates
      other_b_dates_mfv <- as.numeric(names(sort(-table(other_b_dates)))[1])
      if (length(valid_hosp_avg_dates) > 0 && length(other_b_dates) > 0) {
        # Rule 7 (obs_label = "2.2.2.g"): take the YEAR from the hospital
        # record and the month/day from the other sources
        avg_hosp_date <- unique(valid_hosp_avg_dates)
        avg_hosp_year <- as.integer(format(as.Date(avg_hosp_date[1]), "%Y"))
        day_month <- format(as.Date.numeric(other_b_dates_mfv), "%m-%d")
        new_birth_date_str <- paste(avg_hosp_year, day_month, sep = "-")
        new_birth_date <- as.Date(new_birth_date_str, format = "%Y-%m-%d")
        df_group <- df_group %>%
          mutate(sel_birth_date = new_birth_date,
                 obs = "2.2.2.g. Select hospital birth year, keep month & day of other sources")
        return(df_group)
      }
      # Select the most frequent date among valid_birth_dates
      birth_date_counts <- table(valid_birth_dates)
      max_count <- max(birth_date_counts)
      most_freq_dates <- as.Date(names(birth_date_counts[birth_date_counts == max_count]))

      # If there's a tie, select the date closest to the global average
      if (length(most_freq_dates) > 1) {
        diffs <- abs(as.numeric(most_freq_dates) - as.numeric(avg_birth_date))
        sel_date <- most_freq_dates[which.min(diffs)]
      } else {
        sel_date <- most_freq_dates
      }

      df_group <- df_group %>%
        mutate(sel_birth_date = sel_date,
               obs = "2.2.2.c.Multiple birth dates. Chose the most frequent. If ties, selected the closest to the average date")
      return(df_group)
    } else {
      # Rule 4 (obs_label = "2.2.2.d"): no valid birth dates within the
      # percentile range; fall back to the date closest to the global average
      diffs <- abs(as.numeric(birth_dates) - as.numeric(avg_birth_date))
      sel_date <- birth_dates[which.min(diffs)]
      df_group <- df_group %>%
        mutate(sel_birth_date = sel_date,
               FLAG = 'Selected date closest to average birth date',
               obs = "2.2.2.d.No valid birthdate. Select closest to average")
      return(df_group)
    }
  }
  # Rule 5: If only one birth date is available (obs_label = "2.2.2.e")
  if (length(birth_dates) == 1) {
    # Check if it's within the valid range
    if (birth_dates >= q1_birth_date && birth_dates <= q99_birth_date) {
      df_group <- df_group %>%
        mutate(sel_birth_date = birth_dates,
               obs = "2.2.2.e. Single birth date available, selected")
      return(df_group)
    } else {
      # Rule 6 (obs_label = "2.2.2.f"): outside the valid range; keep but flag
      sel_date <- birth_dates
      df_group <- df_group %>%
        mutate(sel_birth_date = sel_date,
               FLAG = 'Single date outside valid range, selected as is',
               obs = "2.2.2.f.Single date outside valid range, selected as is")
      return(df_group)
    }
  }
  # Rule 8: Select date closest to average birth date (obs_label = "2.2.2.h").
  # NOTE(review): rules 3 and 5 both return, so this branch is only reachable
  # with length(birth_dates) == 0 and its condition is then FALSE -- kept for
  # safety / behavioral parity (its reported count is 0).
  if (length(birth_dates) > 0) {
    diffs <- abs(as.numeric(birth_dates) - as.numeric(avg_birth_date))
    sel_date <- birth_dates[which.min(diffs)]
    df_group <- df_group %>%
      mutate(sel_birth_date = sel_date,
             FLAG = 'Selected date closest to average birth date',
             obs = "2.2.2.h.Selected date closest to average birth date")
    return(df_group)
  }
  # Rule 9: Flag records with unresolvable inconsistencies (obs_label = "2.2.2.i")
  df_group <- df_group %>%
    mutate(FLAG = 'Unresolvable inconsistencies',
           obs = "2.2.2.i.Unresolvable inconsistencies or extremely implausible birth dates")

  return(df_group)
}

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
# Error-tolerant wrapper (kept for debugging; the map below calls the raw
# function directly).
process_safe <- purrr::safely(process_inv_birthdates_w_info)

# Apply the function to each group and combine the results
proc_invalid_adm_ages_long_w_po_and_c1_info <- invalid_adm_ages_long_w_po_and_c1_info|> 
  dplyr::group_split(hash_key)|>
  #purrr::map_dfr(~ process_safe(.x)$result)
  purrr::map_dfr(process_inv_birthdates_w_info)

#AUG 2025
#SISTRAT23_c1_2010_2024_df_prev1d |> filter(hash_key=="f23b9b3b7cbaf2510fd998d6324e08e46c312cd127fffe78fbaa24c6fc5aff5b") |> glimpse()
#NAs result from invalid dates, so they might not be replaceable

# Report how many rows still lack a C1 birth date after processing.
cat(paste0("HASHs w/ C1 missing birth date: ", proc_invalid_adm_ages_long_w_po_and_c1_info|> filter(is.na(birth_date))|> nrow(),"\n")) # 145 AUG 2025
cat(paste0("Warning, Unknown or uninitialised column: `min_adm_age_flag`: ",174,"\n"))

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
invisible("h and g do not have values")
# Lookup table mapping each resolution-rule code (2.2.2.a ... 2.2.2.i) to a
# short human-readable description; used by the per-rule summary loop below.
cases <- setNames(
  c("Prioritized birthdate of mortality database",
    "Used PO date if within percentiles 1 and 99",
    "Multiple birth dates; chose most frequent; ties -> closest to average",
    "No valid birthdate; selected closest to average",
    "Single birth date available; selected",
    "Single date outside valid range; selected as is",
    "Get birth year from hospital avg; keep month/day from other sources",
    "Selected date closest to average birth date",
    "Unresolvable inconsistencies or extremely implausible birth dates"),
  paste0("2.2.2.", letters[1:9])
)

# For every rule code, report the matching entry count and the number of
# distinct individuals (hash_key). Same messages, in the same order, as the
# original purrr::iwalk() version.
for (code in names(cases)) {
  desc <- cases[[code]]
  pat <- gsub("\\.", "\\\\.", code)  # escape dots so the code matches literally
  df <- proc_invalid_adm_ages_long_w_po_and_c1_info |>
    dplyr::filter(grepl(pat, obs))
  message(code, ". ", desc)
  message(code, ", Entries: ", nrow(df))
  message(code, ", RUNs: ", dplyr::n_distinct(df$hash_key))
}

2.2.2.a. Prioritized birthdate of mortality database

2.2.2.a, Entries: 54

2.2.2.a, RUNs: 18

2.2.2.b. Used PO date if within percentiles 1 and 99

2.2.2.b, Entries: 256

2.2.2.b, RUNs: 133

2.2.2.c. Multiple birth dates; chose most frequent; ties -> closest to average

2.2.2.c, Entries: 2

2.2.2.c, RUNs: 1

2.2.2.d. No valid birthdate; selected closest to average

2.2.2.d, Entries: 0

2.2.2.d, RUNs: 0

2.2.2.e. Single birth date available; selected

2.2.2.e, Entries: 28

2.2.2.e, RUNs: 15

2.2.2.f. Single date outside valid range; selected as is

2.2.2.f, Entries: 4

2.2.2.f, RUNs: 2

2.2.2.g. Get birth year from hospital avg; keep month/day from other sources

2.2.2.g, Entries: 8

2.2.2.g, RUNs: 3

2.2.2.h. Selected date closest to average birth date

2.2.2.h, Entries: 0

2.2.2.h, RUNs: 0

2.2.2.i. Unresolvable inconsistencies or extremely implausible birth dates

2.2.2.i, Entries: 2

2.2.2.i, RUNs: 2

Code
# 2.2.2.a, Entries: 54 #54
# NULL
# 2.2.2.a, RUNs: 18 #18
# NULL
# 2.2.2.b, Entries: 36 #232
# NULL
# 2.2.2.b, RUNs: 18 #122
# NULL
# 2.2.2.c, Entries: 11 #2
# NULL
# 2.2.2.c, RUNs: 5 #1
# NULL
# 2.2.2.d, Entries: 18 #14
# NULL
# 2.2.2.d, RUNs: 8 #6
# NULL
# 2.2.2.e, Entries: 183 #35
# NULL
# 2.2.2.e, RUNs: 92 #18
# NULL
# 2.2.2.f, Entries: 7 #7
# NULL
# 2.2.2.f, RUNs: 4 #4
# NULL
# 2.2.2.g, Entries: 31 #11
# NULL
# 2.2.2.g, RUNs: 13 #4
# NULL
# 2.2.2.h, Entries: 0 #0
# NULL
# 2.2.2.h, RUNs: 0 #0
# NULL
# 2.2.2.i, Entries: 19 #4
# NULL
# 2.2.2.i, RUNs: 19 #4

#AUG 2025, 
# 2.2.2.a. Prioritized birthdate of mortality database
# 2.2.2.a, Entries: 54
# 2.2.2.a, RUNs: 18
# 2.2.2.b. Used PO date if within percentiles 1 and 99
# 2.2.2.b, Entries: 256
# 2.2.2.b, RUNs: 133
# 2.2.2.c. Multiple birth dates; chose most frequent; ties -> closest to average
# 2.2.2.c, Entries: 2
# 2.2.2.c, RUNs: 1
# 2.2.2.d. No valid birthdate; selected closest to average
# 2.2.2.d, Entries: 0
# 2.2.2.d, RUNs: 0
# 2.2.2.e. Single birth date available; selected
# 2.2.2.e, Entries: 28
# 2.2.2.e, RUNs: 15
# 2.2.2.f. Single date outside valid range; selected as is
# 2.2.2.f, Entries: 4
# 2.2.2.f, RUNs: 2
# 2.2.2.g. Get birth year from hospital avg; keep month/day from other sources
# 2.2.2.g, Entries: 8
# 2.2.2.g, RUNs: 3
# 2.2.2.h. Selected date closest to average birth date
# 2.2.2.h, Entries: 0
# 2.2.2.h, RUNs: 0
# 2.2.2.i. Unresolvable inconsistencies or extremely implausible birth dates
# 2.2.2.i, Entries: 2
# 2.2.2.i, RUNs: 2

#b002ce96b750e7e32aa9d2bdb611520e938293b7221415784fdb2fceeac52a8f, opciones raras, o 1917 o 2002
#a534782e4a670be9fe584f9b1b47a7f38a73f6b08f18aa7a7b4a45a7ba628dbf
#a3f5934a1b72932a34d89c71aec939a327499373107718abb42f437b5bc94254 por qué sólo una
#proc_invalid_adm_ages_long_w_po_and_c1_info |> filter(hash_key=="b002ce96b750e7e32aa9d2bdb611520e938293b7221415784fdb2fceeac52a8f") |> glimpse()
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
HASHs w/ C1 missing birth date: 145
Warning, Unknown or uninitialised column: `min_adm_age_flag`: 174

We added the corrected birth date and admission age.

Code
invisible("Check any errors in conversion of invalid ages: no ext data")
# Sanity check: the external-data fixes and the "no external data" fixes
# should be disjoint on hash_key — the printed row count is expected to be 0.
message(dplyr::inner_join(proc_birthdates_ext_data,proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info, "hash_key") |> nrow())

0

Code
invisible("Check any errors in conversion of invalid ages: w/ ext data")
# IDs whose birth dates are resolved by BOTH pipelines: overlap between the
# external-data fixes and the invalid-admission-age fixes.
hashs_inconsistent_and_invalid_ages<-
proc_birthdates_ext_data |> 
  dplyr::inner_join(proc_invalid_adm_ages_long_w_po_and_c1_info, "hash_key", multiple="first")

# e3d18a65489e87325c1a4e59286283a211b7ef31505bc48a4f62956cc491f5d3 was
# reportedly the only such case (2.2.2.e. Single birth date available, selected)

invisible("a534782e4a670be9fe584f9b1b47a7f38a73f6b08f18aa7a7b4a45a7ba628dbf")
#a534782e4a670be9fe584f9b1b47a7f38a73f6b08f18aa7a7b4a45a7ba628dbf only dates below 16
#already noticed in earlier stages

# Fold the corrected birth dates back into the main C1 table:
#  - first join the "no external data" fixes (sel_birth_date -> .x suffix),
#  - then the "with external data" fixes (sel_birth_date -> .y suffix),
#  - recompute the admission age, append rule labels to OBS, and drop the
#    temporary join columns.
SISTRAT23_c1_2010_2024_df_prev1e<-
SISTRAT23_c1_2010_2024_df_prev1d|>
  tidylog::left_join(proc_invalid_adm_ages_no_ext_data_long_w_po_and_c1_info[,c("hash_key","sel_birth_date","FLAG")], by="hash_key", multiple="first")|> 
  tidytable::rename("obs_invalid_dates_noext"="FLAG")|> 
  tidylog::left_join(proc_invalid_adm_ages_long_w_po_and_c1_info[,c("hash_key","sel_birth_date","obs")], by="hash_key", multiple="first")|> 
  tidytable::rename("obs_invalid_dates_ext"="obs")|>   
  # Replace the birth date only for rows touched by either fix set
  tidytable::mutate(birth_date_rec = tidytable::case_when( !is.na(obs_invalid_dates_noext)~ as.Date(sel_birth_date.x), T~birth_date))|> 
  tidytable::mutate(birth_date_rec = tidytable::case_when( !is.na(obs_invalid_dates_ext)~ as.Date(sel_birth_date.y), T~birth_date_rec))|>   
  # Recompute admission age (in fractional years) from the corrected birth date
  tidytable::mutate(adm_age_rec= lubridate::time_length(lubridate::interval(birth_date_rec,adm_date_rec), "years"))|>
  tidytable::mutate(OBS = tidytable::case_when( !is.na(obs_invalid_dates_noext)~ glue("{OBS};{obs_invalid_dates_noext}"),T~OBS))|>
  tidytable::mutate(OBS = tidytable::case_when( !is.na(obs_invalid_dates_ext)~ glue("{OBS};{obs_invalid_dates_ext}"),T~OBS))|>
  tidytable::mutate(OBS= gsub("^;", "", OBS))|>
  tidytable::select(-any_of(c("sel_birth_date.x","obs_invalid_dates_noext","sel_birth_date.y","obs_invalid_dates_ext")))|>
  tidytable::as_tidytable()

left_join: added 2 columns (sel_birth_date, FLAG)

       > rows only in SISTRAT23_c1_2010_2024_..  174,040
       > rows only in proc_invalid_adm_ages_n.. (      0)
       > matched rows                                 85
       >                                        =========
       > rows total                              174,125

left_join: added 3 columns (sel_birth_date.x, sel_birth_date.y, obs) > rows only in tidytable::rename(tidyl.. 173,947 > rows only in proc_invalid_adm_ages_l.. ( 0) > matched rows 178 > ========= > rows total 174,125

Code
# Report entries (and distinct IDs) whose recomputed admission age is
# implausible (>90 or <16) or missing.
message(paste0("Number of entries w/ infrequent admission ages (>90|<16) or missing data= ",
    tidytable::filter(SISTRAT23_c1_2010_2024_df_prev1e, adm_age_rec>90|adm_age_rec<16|is.na(adm_age_rec))|> nrow(),"\n(HASHs= ", 
    tidytable::filter(SISTRAT23_c1_2010_2024_df_prev1e, adm_age_rec>90|adm_age_rec<16|is.na(adm_age_rec))|> distinct(hash_key) |> nrow(),")"))

Number of entries w/ infrequent admission ages (>90|<16) or missing data= 48 (HASHs= 48)

Code
# Expected (AUG 2025): 48 entries / 48 HASHs flagged above.
# Number of entries w/ infrequent admission ages (>90|<16) or missing data= 48
# (HASHs= 48)
# Report entries (and distinct IDs) whose recomputed admission age is missing.
message(paste0("Number of entries w/ missing data only= ",
    tidytable::filter(SISTRAT23_c1_2010_2024_df_prev1e, is.na(adm_age_rec))|> nrow(),"\n(HASHs= ", 
    tidytable::filter(SISTRAT23_c1_2010_2024_df_prev1e, is.na(adm_age_rec))|> distinct(hash_key) |> nrow(),")"))

Number of entries w/ missing data only= 13 (HASHs= 13)

Code
#6d5f2fc8d4c835e227ac7f99c96f710c235b0415d95571a976b481f9170a4c34 has a missing date

# Number of entries w/ missing data only= 13
# (HASHs= 13)

# Histogram of the implausible admission ages (missing values excluded by the
# filter's comparison semantics).
tidytable::filter(SISTRAT23_c1_2010_2024_df_prev1e, adm_age_rec>90|adm_age_rec<16)|>
    pull(adm_age_rec)|> 
    hist(breaks=50, main= "Infrequent ages (>90|<16)", xlab= "Admission age (years)")

For example, the ID a534782e4a670be9fe584f9b1b47a7f38a73f6b08f18aa7a7b4a45a7ba628dbf was associated only with age entries below 16 years old, also corroborated by external information.

Imputation methods were evaluated in the script hist_scripts/Duplicates_24_imp_methods.R. However, the imputation was not sufficiently precise —specifically, the imputed admission ages spanned nearly 10 years— so we chose not to use it. We discarded IDs with ages <13 and with missing ages.

Code
#hashs_invalid_adm_age

# We explored 3 types of imputation: using k-nearest neighbours, random forests and multiple imputation with chained equations. The variables used as candidates were: `"sexo"`, `"tipo_centro"`, `"tipo_de_plan"`, `"pais_nacimiento"`, `"se_trata_de_una_mujer_embarazada"`, `"escolaridad_ultimo_ano_cursado"`, `"sustancia_principal"`, `"edad_inicio_sustancia_principal"`, `"tiene_menores_de_edad_a_cargo"`, `"edad_inicio_consumo"`, `"numero_de_hijos"`, `"estado_conyugal"`, `"TABLE_rec2"`, `"numero_de_tratamientos_anteriores"`, `"usuario_de_tribunales_tratamiento_drogas"`.
# 
# The following variables have specific characteristics that should be considered:  
# 
# - `tiene_menores_de_edad_a_cargo` (responsible for minors): missing data is only present before 2015.
# - `numero_de_hijos` (number of children): values greater than 11 are incorrect.  
# - `usuario_de_tribunales_tratamiento_drogas` (drug treatment court user): contains more missing data in 2016.  
# - `pais_nacimiento` (country of birth): this information starts being collected in 2016.
# - `se_trata_de_una_mujer_embarazada` (pregnant woman): substantial missing data, but it is evenly distributed across annual datasets.  

invisible("Cases with inconsistencies in birth date due to modifications of rule based protocols")
# IDs that still map to more than one distinct recoded birth date — should be
# empty after the rule-based corrections ("es 0" below = "it is 0").
hashs_inconsistent_ages_post_rule_based_imp<-
SISTRAT23_c1_2010_2024_df_prev1e|> 
  tidytable::group_by(hash_key)|> 
  tidytable::summarise(n = n_distinct(birth_date_rec))|> 
  tidytable::filter(n > 1)|> 
  pull(hash_key)
invisible("es 0")

# SISTRAT23_c1_2010_2024_df_prev1e |> 
#   dplyr::filter(hash_key %in% hashs_inconsistent_ages_post_rule_based_imp) |> View()

# IDs whose admission age is implausible or missing: candidates for imputation.
hashs_for_imputation_adm_birth_year<-
tidytable::filter(SISTRAT23_c1_2010_2024_df_prev1e, adm_age_rec>90|adm_age_rec<16|is.na(adm_age_rec))|> distinct(hash_key)

# Seed the imputation target: keep birth_date_rec where the admission age is
# known, leave NA where it must be imputed.
# NOTE(review): base ifelse() strips the Date class, so birth_date_rec_imp ends
# up numeric (days since 1970-01-01); downstream code calls as.Date() on it —
# confirm this round-trip is intentional.
SISTRAT23_c1_2010_2024_df_prev1e$birth_date_rec_imp<-NA
SISTRAT23_c1_2010_2024_df_prev1e$birth_date_rec_imp<-
    ifelse(!is.na(SISTRAT23_c1_2010_2024_df_prev1e$adm_age_rec),
           SISTRAT23_c1_2010_2024_df_prev1e$birth_date_rec,
           SISTRAT23_c1_2010_2024_df_prev1e$birth_date_rec_imp)

invisible("Create database")
# Working copy used as input by both imputation methods below.
dataset_with_na <- SISTRAT23_c1_2010_2024_df_prev1e
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_

set.seed(2125)
#https://mayer79.github.io/missRanger/articles/missRanger.html
# Random-forest imputation (missRanger) of the missing birth dates, using
# predictive mean matching (pmm.k = 3) so imputed values are drawn from
# observed donor values rather than raw model predictions.
# NOTE(review): this formula uses TABLE_rec while the kNN imputation below
# uses TABLE_rec2 — confirm which variable is intended.
SISTRAT23_c1_2010_2024_df_prev1e_imp <- missRanger::missRanger(
    data = dataset_with_na,
    formula = birth_date_rec_imp ~ sexo+ tipo_centro + tipo_de_plan + pais_nacimiento +
              se_trata_de_una_mujer_embarazada + escolaridad_ultimo_ano_cursado +
              sustancia_principal + edad_inicio_sustancia_principal +
              tiene_menores_de_edad_a_cargo + edad_inicio_consumo +
              numero_de_hijos + estado_conyugal+ 
      TABLE_rec+ numero_de_tratamientos_anteriores+ usuario_de_tribunales_tratamiento_drogas, 
    num.trees = 5e3,
    pmm.k = 3,  # Predictive mean matching
    keep_forests = TRUE,  # was T; TRUE is safer (T/F can be reassigned)
    returnOOB = TRUE,     # was T
    #mtry= function(p) max(3, trunc(p / 3)), # At least 3 or parameters/3, whichever is greater.
    maxiter= 5e2,
    verbose = 2,
    seed= 2125,
    #case.weights = rowSums(!is.na(SISTRAT23_c1_2010_2024_df_prev1f)) #pass case weights to the imputation models. For instance, this allows to reduce the contribution of rows with many missings
  )
paste0("Best iter:", SISTRAT23_c1_2010_2024_df_prev1e_imp$best_iter)

paste0("Mtry: how many covariates are considered in each tree split: ", floor(sqrt(12)))
#Quick and balanced. More if there are many complexities to capture
#Reduce if there a few data of overadjustment

#OOB prediction error per iteration and variable (1 minus R-squared for regression)

#The default mtry in missRanger is sqrt(p), where p is the number of variables in the dataset.
#OOB prediction errors are quantified as 1 - R^2 for numeric variables, and as classification error otherwise. If a variable has been imputed only univariately, the value is 1.
#https://rdrr.io/cran/missRanger/man/missRanger.html

paste0("The model explains ",scales::percent(1-min(SISTRAT23_c1_2010_2024_df_prev1e_imp$mean_pred_errors), accuracy=.1), " of the variance. This is calculated using the out of bag samples in each tree split")
# [1] "The model explains 51.2% of the variance. This is calculated using the out of bag samples in each tree split"

#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_#_
set.seed(2125)
# k-nearest-neighbours imputation (VIM::kNN, k = 3) of the same target,
# averaging the donors' values via numFun = "mean" over the listed distance
# variables. Note this method uses TABLE_rec2 (cf. missRanger above).
SISTRAT23_c1_2010_2024_df_prev1e_imp_kNN<-VIM::kNN(dataset_with_na, variable = c("birth_date_rec_imp"), dist_var=c("sexo", "tipo_centro", "tipo_de_plan", "pais_nacimiento","se_trata_de_una_mujer_embarazada", "escolaridad_ultimo_ano_cursado", "sustancia_principal", "edad_inicio_sustancia_principal", "tiene_menores_de_edad_a_cargo", "edad_inicio_consumo","numero_de_hijos", "estado_conyugal", "TABLE_rec2", "numero_de_tratamientos_anteriores", "usuario_de_tribunales_tratamiento_drogas"),
                                   numFun = "mean", 
                                   k=3,
                                   trace=TRUE)  # was T; TRUE avoids the reassignable-T footgun
#Time difference of 1.000705 hours
Code
# Exploratory: raw imputed values and their Date representation.
# NOTE(review): as.Date() on a numeric needs `origin=` before R 4.3 (the
# "1970-01-01" default was added then) — confirm the R version in use, or pass
# origin = "1970-01-01" explicitly.
SISTRAT23_c1_2010_2024_df_prev1e_imp_kNN$birth_date_rec_imp
as.Date(SISTRAT23_c1_2010_2024_df_prev1e_imp_kNN$birth_date_rec_imp)
#2.2.1.c
#2.2.1.a
#2.2.2.f
#2.2.2.i
# NOTE(review): the list above mentions 2.2.1.a but the code below matches
# 2.2.1.e — confirm which rule code is intended.
# (translated) Extremely implausible birth dates (on or after Jan. 1, 2010, or
# on or before Jan. 1, 1910): there may be no cases with only implausible
# dates, and in that case nothing can be done.
invisible("HASHs that were filed for probabilistic imputation")
# IDs flagged by any of the rule codes below. fixed=TRUE makes the dots
# literal — the bare patterns would let "." match any character, matching the
# escaped-dot convention used in the per-rule summary loop above.
flowchart_red_for_imp<-
unique(c(
unique(SISTRAT23_c1_2010_2024_df_prev1e$hash_key[grepl("2.2.1.c",SISTRAT23_c1_2010_2024_df_prev1e$OBS, fixed=TRUE)]),
unique(SISTRAT23_c1_2010_2024_df_prev1e$hash_key[grepl("2.2.1.e",SISTRAT23_c1_2010_2024_df_prev1e$OBS, fixed=TRUE)]),
unique(SISTRAT23_c1_2010_2024_df_prev1e$hash_key[grepl("2.2.2.f",SISTRAT23_c1_2010_2024_df_prev1e$OBS, fixed=TRUE)]),
unique(SISTRAT23_c1_2010_2024_df_prev1e$hash_key[grepl("2.2.2.i",SISTRAT23_c1_2010_2024_df_prev1e$OBS, fixed=TRUE)])
))
invisible("Gross criteria for discarding HASHs")
# IDs removed outright: implausible (>90 or <16) or missing admission age.
discarded_hashs<-
tidytable::filter(SISTRAT23_c1_2010_2024_df_prev1e, adm_age_rec>90|adm_age_rec<16|is.na(adm_age_rec))|> distinct(hash_key) |> pull(hash_key)

invisible("To explore which HASHs are not in the flowchart but fulfill the criteria for discard")
# SISTRAT23_c1_2010_2024_df_prev1f |> 
#   dplyr::filter(hash_key %in% setdiff(discarded_hashs, flowchart_red_for_imp)) |> 
#   janitor::tabyl(OBS, show_na = T) |> 
#   dplyr::arrange(desc(n)) |> 
#   dplyr::mutate(percent = round(percent*100, digits = 1))
# structure(list(OBS = structure(c("2.2.1.g.Only one birth date, no anomalous values, kept the original birth date", 
# "1.1. Duplicated Cases in Almost Every Variable;2.2.2.d.No valid birthdate. Select closest to average", 
# "2.2.2.g.Get birth date year from the average dates of hospital admissions", 
# "2.2.2.d.No valid birthdate. Select closest to average", "", 
# "1.1. Duplicated Cases in Almost Every Variable;2.2.2.b. Used PO date if within percentiles 1 and 99", 
# "1.1. Duplicated Cases in Almost Every Variable;2.2.2.c.Multiple birth dates. Chose the most frequent. If ties, selected the closest to the average date", 
# "2.2.2.b. Used PO date if within percentiles 1 and 99", "1.1. Duplicated Cases in Almost Every Variable;2.1.1.c.Multiple common dates found. Select the birth date closest to available external records;2.2.2.c.Multiple birth dates. Chose the most frequent. If ties, selected the closest to the average date", 
# "1.1. Duplicated Cases in Almost Every Variable;2.2.1.b. Not anomalous adm age, have a PO birth date, replace birth date with the closest birth date to the average birth date", 
# "1.1. Duplicated Cases in Almost Every Variable;2.2.2.e. Single birth date available, selected", 
# "1.1. Duplicated Cases in Almost Every Variable;2.2.2.g.Get birth date year from the average dates of hospital admissions", 
# "2.1.1.c.Multiple common dates found. Select the birth date closest to available external records;2.2.2.c.Multiple birth dates. Chose the most frequent. If ties, selected the closest to the average date", 
# "2.2.1.b. Not anomalous adm age, have a PO birth date, replace birth date with the closest birth date to the average birth date", 
# "2.2.2.a. Prioritized birthdate of mortality database", "2.2.2.c.Multiple birth dates. Chose the most frequent. If ties, selected the closest to the average date"
# ), class = c("glue", "character")), n = c(10L, 5L, 4L, 3L, 2L, 
# 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), percent = c(26.3, 
# 13.2, 10.5, 7.9, 5.3, 5.3, 5.3, 5.3, 2.6, 2.6, 2.6, 2.6, 2.6, 
# 2.6, 2.6, 2.6)), row.names = c(NA, -16L), class = c("tabyl", 
# "data.frame"))

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:

invisible("Ver caso 8f20a93917a8d893793661630272c312df4d429901dfcf4fe859b95f379e70d8, tiene una edad de 22 ademas de una mala")
Code
# Drop rows with implausible/missing admission ages, logging entry and ID
# counts before and after via anonymous-function "taps" in the pipe.
# NOTE(review): keeps ages in [13, 90] although the reports above flag <16 —
# per the surrounding text the 13-year lower bound is intentional; confirm.
SISTRAT23_c1_2010_2024_df_prev1f<-
SISTRAT23_c1_2010_2024_df_prev1e|>
      (\(df) {
        message(paste0("Discarding infrequent & missing adm ages before rule-based imp, Entries: ", nrow(df)))
        message(paste0("Discarding infrequent & missing adm ages before rule-based imp, RUNs: ", tidytable::distinct(df, hash_key) |> nrow()))
        df
    })() |> 
  tidytable::filter(adm_age_rec<=90, adm_age_rec>=13, !is.na(adm_age_rec)) |> 
      (\(df) {
        message(paste0("Discarding infrequent & missing adm ages after rule-based imp, Entries: ", nrow(df)))
        message(paste0("Discarding infrequent & missing adm ages after rule-based imp, RUNs: ", tidytable::distinct(df, hash_key) |> nrow()))
        df
    })()

Discarding infrequent & missing adm ages before rule-based imp, Entries: 174125

Discarding infrequent & missing adm ages before rule-based imp, RUNs: 121463

Discarding infrequent & missing adm ages after rule-based imp, Entries: 174110

Discarding infrequent & missing adm ages after rule-based imp, RUNs: 121448

Code
# 150133 #150190
# 106227  #106284
# AUG 2025
# Discarding infrequent & missing adm ages before rule-based imp, Entries: 174125
# Discarding infrequent & missing adm ages before rule-based imp, RUNs: 121463
# Discarding infrequent & missing adm ages after rule-based imp, Entries: 174110
# Discarding infrequent & missing adm ages after rule-based imp, RUNs: 121448
invisible("Por tanto, sacamos a 21 usuarios; AUG 2025= sacamos a 15 usuarios")


3. Standardization of Some Variables

In this stage, we implemented most of the recommendations made by SENDA’s professionals:

  • Deleted cases categorized as Parole (n = 1)
  • Collapse the different plan types (plan_type) into the following: pg-pab, pg-pai, pg-pr, m-pab, m-pai, and m-pr. M-PAB programs were identified (n=82), although they should not have been in operation. This issue was documented some time ago, possibly as part of a pilot initiative.
  • Declared the age of onset of drug use as invalid if it exceeded the current age (age_subs_onset).
  • Declared the age of drug use for the main substance as invalid if inconsistent (age_prim_subs_onset).
  • Renamed the admission-age variable to adm_age_rec (the recomputed admission age)
  • Defined early and late drop-outs within compliance with treatment (tr_compliance): not completed- early [<90 days] discharge, not completed- late [>=90 days] discharge, early adm. discharge (dismissal due to misconduct), death, completed, referral, currently in. Cases that were classified as death as a cause of discharge were transformed into administrative discharge to ensure consistency with other cases.
  • Collapsed and standardized substances (first_sub_used, primary_sub, second_sub1, second_sub2, second_sub3) into the following categories: hallucinogens (e.g., LSD, mushrooms), cocaine, amphetamine-type stimulants (e.g., methamphetamine, ecstasy), inhalants, marijuana, opioids (e.g., heroin, methadone, painkillers), cocaine paste base and crack, sedatives, hypnotics and tranquilizers, and others (e.g., anabolic steroids). Records without secondary substance info (CIP- Temporary Detention Center [Centro de internación provisoria] or CRC- Closed Regime Detention Center[Centro de Régimen Cerrado]), without consumption, or with an unspecified primary substance category were marked as missing values. Entries containing “alcohol” were classified as Alcohols; Cocaine – paste: “pasta base”, “basuco”, “crack”; Cocaine – powder: “cocaína”/“cocaine”; Cannabis: “marihuana”/“cannabis”; Dissociatives: “ketamine”/“tussi (tusi)”/“PCP (phencyclidine)”/“DXM”; Hallucinogens: “LSD”, “mushrooms/psilocybin”, “ayahuasca”, “mescaline”, “2C-”, “NBOMe”, “DMT”; Amphetamine-type stimulants (ATS): “amphetamines/methamphetamines/MDMA (ecstasy)/methylphenidate/ritalin/estimul”; Tranquilizers/Hypnotics: benzodiazepines and related (“sedante”/“hipnótico”/“tranquiliz”/“benzodiaz”); Inhalants: “inhalables”, neoprene glue, nitrous oxide, poppers, solvents, gasoline, thinner; Opioids: methadone, codeine, tramadol, morphine, meperidine/demerol, oxycodone, hydrocodone, hydromorphone, buprenorphine, tapentadol, “otros opioides analgésicos”, fentanyl/“fenta”; Others: steroids/“otros”; Missing/NA: blanks, “sin consumo”, “sin especificar”.
  • Collapsed marital status (marital_status) into the following categories: married, cohabiting or in shared living arrangements (civil union or de facto), separated or divorced, single, and widowed. Annulled status was coded into divorced (due to the possibility of having children and small size ~.01%), and who do not answer were marked as missing values.
  • Collapsed occupational condition (occupation_condition) into an occupational status composed of three categories: employed, unemployed, and inactive.
  • Assigned an occupational status (occupation_status) only to records with occupational conditions marked as employed, leaving others as missing. Finally, standardized these categories into English labels.
  • Translated labels of Biopsychosocial compromise and ordered according to complexity (biopsych_comp).
  • Translated labels of Drug dependence diagnosis (sub_dep_icd10_status)
  • Tenure status of households were collapsed into Stays temporarily with a relative, Owner/Transferred dwellings/Pays Dividends, Illegal Settlement, and Others (~3.0%).
  • Treatment admission motive (adm_motive) was collapsed into Spontaneous consultation, Sanitary sector referral, Justice sector referral, SENDA-related (Another SUD center/FONODROGAS/ SENDA Previene), and Other (~4.6%)
  • Collapsed educational attainment into three categories (ed_attainment): completed primary school or less, completed or incomplete high school, and more than high school. Categories such as education for students with disabilities were classified in completed primary school. The ‘Don’t know/No response’ categories were classified as invalid.
  • Declared the category “Unknown” for the route of administration of the main substance as an invalid value.
  • Declared the category “Unknown” for the frequency of consumption of the primary substance (prim_sub_freq) as an invalid value, in addition to translating labels and collapsing the “did not use” category, together with the less-than-one-day-a-week frequency, into “less than 1 day a week”.
  • Created a variable with English labels for the type of center (type_center).
  • The commune/municipality of residence was recoded to include the unique territorial code used before 2018 (municipallity_res_cutpre18).
  • Created the classification of the macrozone of residence of the patient.
  • Created the variable macrozone_center which tags every record collapsing Chile’s 16 regions into 6 macro-zones: 1.North – far-north desert regions (Arica y Parinacota, Tarapacá, Antofagasta, Atacama); 2.Center – semi-arid farming and coastal resort areas (Coquimbo, Valparaíso); 3.South-center – central valley heartland and early industrial zones (O’Higgins, Maule, Ñuble/Bío-Bío); 4.South – rainy lake-and-volcano districts (Araucanía, Los Ríos, Los Lagos); 5.Austral – Patagonia and sub-Antarctic territory (Magallanes, Aysén); 0. Metropolitan – Santiago metro region (anything not explicitly listed).
  • Created the translated variable pregnant (collected only since 2017).
  • Created the translated variable pregnant_disch to capture if the patient has been pregnant at discharge (available only since 2017).
  • Created the translated variable adm_disch_reason : death, rule violation, agreement ended, no local service


Code
municipality_map <- c('algarrobo'='algarrobo (6502)',
      'alhue'='alhué (13502)',
      'alto del carmen'='alto del carmen (3302)',
      'alto hospicio'='alto hospicio (1107)',
      'ancud'='ancud (10202)',
      'andacollo'='andacollo (4103)',
      'angol'='angol (9201)',
      'antofagasta'='antofagasta (2101)',
      'antuco'='antuco (8302)',
      'arauco'='arauco (8202)',
      'arica'='arica (15101)',
      'aysen'='aysén (11201)',
      'buin'='buin (13402)',
      'bulnes'='bulnes * (8402)',
      'cabildo'='cabildo (5402)',
      'cabrero'='cabrero (8303)',
      'calama'='calama (2201)',
      'calbuco'='calbuco (10102)',
      'caldera'='caldera (3102)',
      'calera de tango'='calera de tango (13403)',
      'calle larga'='calle larga (5302)',
      'camarones'='camarones (15102)',
      'camina'='camina (1402)',
      'canela'='canela (4202)',
      'canete'='canete (8203)',
      'carahue'='carahue (9102)',
      'cartagena'='cartagena (5603)',
      'casablanca'='casablanca (5102)',
      'castro'='castro (10201)',
      'catemu'='catemu (5702)',
      'cauquenes'='cauquenes (7201)',
      'cerrillos'='cerrillos (13102)',
      'cerro navia'='cerro navia (13103)',
      'chanaral'='chanaral (3201)',
      'chanco'='chanco (7202)',
      'chepica'='chépica (6302)',
      'chiguayante'='chiguayante (8103)',
      'chile chico'='chile chico (11401)',
      'chillan'='chillán * (8401)',
      'chillan viejo'='chillán viejo * (8406)',
      'chimbarongo'='chimbarongo (6303)',
      'cholchol'='cholchol (9121)',
      'chonchi'='chonchi (10203)',
      'cisnes'='cisnes (11202)',
      'cobquecura'='cobquecura * (8403)',
      'cochamo'='cochamó (10103)',
      'cochrane'='cochrane (11301)',
      'codegua'='codegua (6102)',
      'coelemu'='coelemu * (8404)',
      'coihueco'='coihueco * (8405)',
      'coinco'='coinco (6103)',
      'colbun'='colbún (7402)',
      'colchane'='colchane (1403)',
      'colina'='colina (13301)',
      'collipulli'='collipulli (9202)',
      'coltauco'='coltauco (6104)',
      'combarbala'='combarbalá (4302)',
      'concepcion'='concepción (8101)',
      'conchali'='conchalí (13104)',
      'concon'='concón (5103)',
      'constitucion'='constitución (7102)',
      'contulmo'='contulmo (8204)',
      'copiapo'='copiapó (3101)',
      'coquimbo'='coquimbo (4102)',
      'coronel'='coronel (8102)',
      'corral'='corral (14102)',
      'coyhaique'='coyhaique (11101)',
      'cunco'='cunco (9103)',
      'curacautin'='curacautín (9203)',
      'curacavi'='curacaví (13503)',
      'curaco de velez'='curaco de vélez (10204)',
      'curanilahue'='curanilahue (8205)',
      'curarrehue'='curarrehue (9104)',
      'curepto'='curepto (7103)',
      'curico'='curicó (7301)',
      'dalcahue'='dalcahue (10205)',
      'diego de almagro'='diego de almagro (3202)',
      'donihue'='donihue (6105)',
      'el bosque'='el bosque (13105)',
      'el carmen'='el carmen * (8407)',
      'el monte'='el monte (13602)',
      'el quisco'='el quisco (5604)',
      'el tabo'='el tabo (5605)',
      'empedrado'='empedrado (7104)',
      'ercilla'='ercilla (9204)',
      'estacion central'='estación central (13106)',
      'florida'='florida (8104)',
      'freire'='freire (9105)',
      'freirina'='freirina (3303)',
      'fresia'='fresia (10104)',
      'frutillar'='frutillar (10105)',
      'futrono'='futrono (14202)',
      'gorbea'='gorbea (9107)',
      'graneros'='graneros (6106)',
      'guaitecas'='guaitecas (11203)',
      'hijuelas'='hijuelas (5503)',
      'hualaihue'='hualaihué (10403)',
      'hualane'='hualané (7302)',
      'hualpen'='hualpén (8112)',
      'hualqui'='hualqui (8105)',
      'huara'='huara (1404)',
      'huasco'='huasco (3304)',
      'huechuraba'='huechuraba (13107)',
      'illapel'='illapel (4201)',
      'independencia'='independencia (13108)',
      'iquique'='iquique (1101)',
      'isla de maipo'='isla de maipo (13603)',
      'isla de pascua'='isla de pascua (5201)',
      'juan fernandez'='juan fernández (5104)',
      'la calera'='la calera (5502)',
      'la cisterna'='la cisterna (13109)',
      'la cruz'='la cruz (5504)',
      'la estrella'='la estrella (6202)',
      'la florida'='la florida (13110)',
      'la granja'='la granja (13111)',
      'la higuera'='la higuera (4104)',
      'la ligua'='la ligua (5401)',
      'la pintana'='la pintana (13112)',
      'la reina'='la reina (13113)',
      'la serena'='la serena (4101)',
      'la union'='la unión (14201)',
      'lago ranco'='lago ranco (14203)',
      'lago verde'='lago verde (11102)',
      'laja'='laja (8304)',
      'lampa'='lampa (13302)',
      'lanco'='lanco (14103)',
      'las cabras'='las cabras (6107)',
      'las condes'='las condes (13114)',
      'lautaro'='lautaro (9108)',
      'lebu'='lebu (8201)',
      'licanten'='licantén (7303)',
      'limache'='limache (5802)',
      'linares'='linares (7401)',
      'litueche'='litueche (6203)',
      'llanquihue'='llanquihue (10107)',
      'llay llay'='llaillay (5703)',
      'lo barnechea'='lo barnechea (13115)',
      'lo espejo'='lo espejo (13116)',
      'lo prado'='lo prado (13117)',
      'lolol'='lolol (6304)',
      'loncoche'='loncoche (9109)',
      'longavi'='longaví (7403)',
      'lonquimay'='lonquimay (9205)',
      'los alamos'='los álamos (8206)',
      'los andes'='los andes (5301)',
      'los angeles'='los ángeles (8301)',
      'los lagos'='los lagos (14104)',
      'los muermos'='los muermos (10106)',
      'los sauces'='los sauces (9206)',
      'los vilos'='los vilos (4203)',
      'lota'='lota (8106)',
      'lumaco'='lumaco (9207)',
      'machali'='machalí (6108)',
      'macul'='macul (13118)',
      'mafil'='máfil (14105)',
      'maipu'='maipú (13119)',
      'malloa'='malloa (6109)',
      'marchigue'='marchihue (6204)',
      'maria elena'='maría elena (2302)',
      'maria pinto'='maría pinto (13504)',
      'mariquina'='mariquina (14106)',
      'maule'='maule (7105)',
      'maullin'='maullín (10108)',
      'mejillones'='mejillones (2102)',
      'melipeuco'='melipeuco (9110)',
      'melipilla'='melipilla (13501)',
      'molina'='molina (7304)',
      'monte patria'='monte patria (4303)',
      'mulchen'='mulchén (8305)',
      'nacimiento'='nacimiento (8306)',
      'nancagua'='nancagua (6305)',
      'navarino'='cabo de hornos (12201)',
      'navidad'='navidad (6205)',
      'negrete'='negrete (8307)',
      'ninhue'='ninhue * (8408)',
      'niquen'='niquén * (8409)',
      'nogales'='nogales (5506)',
      'nueva imperial'='nueva imperial (9111)',
      'nunoa'='nunoa (13120)',
      "o´higgins"="o'higgins (11302)",
      "o'higgins"="o'higgins (11302)",
      'olivar'='olivar (6111)',
      'ollagüe'='ollagüe (2202)',
      'ollague'='ollagüe (2202)',
      'olmue'='olmué (5803)',
      'osorno'='osorno (10301)',
      'ovalle'='ovalle (4301)',
      'padre hurtado'='padre hurtado (13604)',
      'padre las casas'='padre las casas (9112)',
      'paihuano'='paiguano (4105)',
      'paillaco'='paillaco (14107)',
      'paine'='paine (13404)',
      'palena'='palena (10404)',
      'palmilla'='palmilla (6306)',
      'panguipulli'='panguipulli (14108)',
      'panquehue'='panquehue (5704)',
      'papudo'='papudo (5403)',
      'paredones'='paredones (6206)',
      'parral'='parral (7404)',
      'pedro aguirre cerda'='pedro aguirre cerda (13121)',
      'pelarco'='pelarco (7106)',
      'pelluhue'='pelluhue (7203)',
      'pemuco'='pemuco * (8410)',
      'penaflor'='penaflor (13605)',
      'penalolen'='penalolén (13122)',
      'pencahue'='pencahue (7107)',
      'penco'='penco (8107)',
      'peralillo'='peralillo (6307)',
      'perquenco'='perquenco (9113)',
      'petorca'='petorca (5404)',
      'peumo'='peumo (6112)',
      'pica'='pica (1405)',
      'pichidegua'='pichidegua (6113)',
      'pichilemu'='pichilemu (6201)',
      'pinto'='pinto * (8411)',
      'pirque'='pirque (13202)',
      'pitrufquen'='pitrufquén (9114)',
      'placilla'='placilla (6308)',
      'portezuelo'='portezuelo * (8412)',
      'porvenir'='porvenir (12301)',
      'pozo almonte'='pozo almonte (1401)',
      'primavera'='primavera (12302)',
      'providencia'='providencia (13123)',
      'puchuncavi'='puchuncaví (5105)',
      'pucon'='pucón (9115)',
      'pudahuel'='pudahuel (13124)',
      'puente alto'='puente alto (13201)',
      'puerto montt'='puerto montt (10101)',
      'puerto natales'='puerto natales (12401)',
      'puerto octay'='puerto octay (10302)',
      'puerto saavedra'='puerto saavedra (9116)',
      'puerto varas'='puerto varas (10109)',
      'pumanque'='pumanque (6309)',
      'punitaqui'='punitaqui (4304)',
      'punta arenas'='punta arenas (12101)',
      'puqueldon'='puqueldón (10206)',
      'puren'='purén (9208)',
      'purranque'='purranque (10303)',
      'putaendo'='putaendo (5705)',
      'puyehue'='puyehue (10304)',
      'queilen'='queilén (10207)',
      'quellon'='quellón (10208)',
      'quemchi'='quemchi (10209)',
      'quilaco'='quilaco (8308)',
      'quilicura'='quilicura (13125)',
      'quilleco'='quilleco (8309)',
      'quillon'='quillón * (8413)',
      'quillota'='quillota (5501)',
      'quilpue'='quilpué (5801)',
      'quinchao'='quinchao (10210)',
      'quinta de tilcoco'='quinta de tilcoco (6114)',
      'quinta normal'='quinta normal (13126)',
      'quintero'='quintero (5107)',
      'quirihue'='quirihue * (8414)',
      'rancagua'='rancagua (6101)',
      'ranquil'='ránquil * (8415)',
      'rauco'='rauco (7305)',
      'recoleta'='recoleta (13127)',
      'renaico'='renaico (9209)',
      'renca'='renca (13128)',
      'rengo'='rengo (6115)',
      'requinoa'='requínoa (6116)',
      'retiro'='retiro (7405)',
      'rinconada'='rinconada (5303)',
      'rio bueno'='río bueno (14204)',
      'rio claro'='río claro (7108)',
      'rio hurtado'='río hurtado (4305)',
      'rio negro'='río negro (10305)',
      'romeral'='romeral (7306)',
      'sagrada familia'='sagrada familia (7307)',
      'salamanca'='salamanca (4204)',
      'san antonio'='san antonio (5601)',
      'san bernardo'='san bernardo (13401)',
      'san carlos'='san carlos * (8416)',
      'san clemente'='san clemente (7109)',
      'san esteban'='san esteban (5304)',
      'san felipe'='san felipe (5701)',
      'san fernando'='san fernando (6301)',
      'san francisco de mostazal'='san francisco de mostazal (6110)',
      'san gregorio de niquen'='san gregorio (12104)',
      'san ignacio'='san ignacio * (8418)',
      'san javier'='san javier (7406)',
      'san joaquin'='san joaquín (13129)',
      'san jose de maipo'='san josé de maipo (13203)',
      'san juan de la costa'='san juan de la costa (10306)',
      'san miguel'='san miguel (13130)',
      'san nicolas'='san nicolas (8419)',
      'san pablo'='san pablo (10307)',
      'san pedro'='san pedro (13505)',
      'san pedro de atacama'='san pedro de atacama (2203)',
      'san pedro de la paz'='san pedro de la paz (8108)',
      'san rafael'='san rafael (7110)',
      'san ramon'='san ramón (13131)',
      'san rosendo'='san rosendo (8310)',
      'san vicente'='san vicente (6117)',
      'santa barbara'='santa bárbara (8311)',
      'santa cruz'='santa cruz (6310)',
      'santa juana'='santa juana (8109)',
      'santa maria'='santa maría (5706)',
      'santiago centro'='santiago (13101)',
      'santiago oeste'='santiago (13101)',
      'santiago sur'='santiago (13101)',
      'santo domingo'='santo domingo (5606)',
      'sierra gorda'='sierra gorda (2103)',
      'talagante'='talagante (13601)',
      'talca'='talca (7101)',
      'talcahuano'='talcahuano (8110)',
      'taltal'='taltal (2104)',
      'temuco'='temuco (9101)',
      'teno'='teno (7308)',
      'teodoro schmidt'='teodoro schmidt (9117)',
      'tierra amarilla'='tierra amarilla (3103)',
      'til-til'='tiltil (13303)',
      'tirua'='tirúa (8207)',
      'tocopilla'='tocopilla (2301)',
      'tolten'='toltén (9118)',
      'tome'='tomé (8111)',
      'torres del paine'='torres del paine (12402)',
      'tortel'='tortel (11303)',
      'traiguen'='traiguén (9210)',
      'trehuaco'='treguaco * (8420)',
      'tucapel'='tucapel (8312)',
      'valdivia'='valdivia (14101)',
      'vallenar'='vallenar (3301)',
      'valparaiso'='valparaíso (5101)',
      'vichuquen'='vichuquén (7309)',
      'victoria'='victoria (9211)',
      'vicuna'='vicuna (4106)',
      'vilcun'='vilcún (9119)',
      'villa alegre'='villa alegre (7407)',
      'villa alemana'='villa alemana (5804)',
      'villarrica'='villarrica (9120)',
      'vina del mar'='vina del mar (5109)',
      'vitacura'='vitacura (13132)',
      'yerbas buenas'='yerbas buenas (7408)',
      'yumbel'='yumbel (8313)',
      'yungay'='yungay * (8421)',
      'zapallar'='zapallar (5405)')

# Normalize the SENDA cycle-1 (2010-2024) admission records in one pipeline:
# drop parole plans, collapse treatment-plan codes, invalidate impossible
# onset ages, translate/collapse categorical fields into English analysis
# variables, and append a tag to the OBS audit column for every issue found.
SISTRAT23_c1_2010_2024_df_prev1g<-
SISTRAT23_c1_2010_2024_df_prev1f|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Deletion of paroles and missing values (log sizes before filtering)
      (\(df) {
        message(paste0("Discarding parole & missing plans, Entries: ", nrow(df)))
        message(paste0("Discarding parole & missing plans, RUNs: ", tidytable::distinct(df, hash_key) |> nrow()))
        df
    })()|> 
  tidytable::filter(tipo_de_plan != "pai pv")|>
  tidytable::filter(tipo_de_plan != "pai lv")|> 
  tidytable::mutate(pub_center= factor(if_else(as.character(tipo_centro)=="publico",TRUE,FALSE,NA)))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Collapsed treatment plans (fix: "pg pai 2" was listed twice in the vector)
  tidytable::mutate(OBS= tidytable::case_when(tipo_de_plan %in% c("m pai (p)","m-pr2","pg pai 2", "otro", "calle") ~paste0(as.character(OBS),";","3.1. Collapsed Treatment Plans"), TRUE ~ as.character(OBS)))|> 
  tidytable::mutate(plan_type = tidytable::recode(tipo_de_plan,
     "m pai (p)" = "m-pai",
     "m-pai2" = "m-pai",
     "m-pab" = "m-pab",
     "m-pr2" = "m-pr",
     "pg pai 2" = "pg-pai",
     "pg-pr 2" = "pg-pr",
     "pg -pab 2" = "pg-pab",
     "otro" = "pg-pr",
     "calle" = "pg-pr"))|> 
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Invalid age of substance use onset (vs. admission age): keep only onsets
  # at or below the recorded admission age; flag the rest in OBS.
  tidytable::mutate(age_subs_onset= ifelse(edad_inicio_consumo<= adm_age_rec, edad_inicio_consumo, NA))|>
  tidytable::mutate(OBS= case_when(edad_inicio_consumo>adm_age_rec ~ paste0(as.character(OBS),";","3.2. Invalid Age Of Onset of Substance use, Higher than admission age"), TRUE ~ as.character(OBS)))|>
  tidytable::mutate(age_prim_subs_onset= ifelse(edad_inicio_sustancia_principal<= adm_age_rec, edad_inicio_sustancia_principal, NA))|>
  tidytable::mutate(OBS= case_when(edad_inicio_sustancia_principal>adm_age_rec ~ paste0(as.character(OBS),";","3.3. Invalid Age Of Onset of Primary Substance use, Higher than admission age"), TRUE ~ as.character(OBS)))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Invalid age of substance use onset (< 5 yrs old): flag first, then blank.
  tidytable::mutate(OBS=case_when(age_subs_onset< 5 & age_subs_onset>=0 ~ paste0(as.character(OBS),";","3.4. Invalid Age Of Onset of Substance use, <5 yrs old"), TRUE ~ as.character(OBS)))|>
  tidytable::mutate(OBS=case_when(age_prim_subs_onset< 5 & age_prim_subs_onset>=0 ~ paste0(as.character(OBS),";","3.5. Invalid Age Of Onset of Primary Substance use,  <5 yrs old"), TRUE ~ as.character(OBS)))|>
  tidytable::mutate(age_subs_onset= ifelse(age_subs_onset< 5 & age_subs_onset>=0, NA, age_subs_onset))|>
  tidytable::mutate(age_prim_subs_onset= ifelse(age_prim_subs_onset< 5 & age_prim_subs_onset>=0, NA, age_prim_subs_onset))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Negative treatment days: flagged in OBS, then set to missing.
  tidytable::mutate(OBS=case_when(dit_rec<0 ~ paste0(as.character(OBS),";","3.6. Negative Treatment Days, Changed Treat Days"),TRUE ~ as.character(OBS)))|>
  tidytable::mutate(dit_rec= ifelse(dit_rec<0, NA, dit_rec))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Early vs. late dropout: 0 = stayed >= 90 days, 1 = left earlier (also 1
  # when dit_rec is missing). Factor labels map 0 -> ">= 90 days", 1 -> "<90 days".
  tidytable::mutate(dit_earl_drop= ifelse(dit_rec>=90 & !is.na(dit_rec),0,1))|>
  # changed the order of the labels (2025-06-02)
  tidytable::mutate(dit_earl_drop= factor(dit_earl_drop, labels=c(">= 90 days","<90 days")))|> #t.test(dit_rec~ dit_earl_drop, data= df)
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Treatment compliance. Deaths are recoded into administrative discharges
  # (fix: former replacement value "alta adminsitrativa" was misspelled,
  # creating a category distinct from the genuine "alta administrativa").
  tidytable::mutate(motivo_de_egreso= ifelse(motivo_de_egreso=="muerte","alta administrativa", motivo_de_egreso))|>
  # NOTE(review): given the recode above, the 'muerte' -> "death" branch below
  # can never match; kept to document the intended mapping -- confirm.
  tidytable::mutate(tr_compliance= case_when(grepl("<",dit_earl_drop) & motivo_de_egreso=="abandono"~ "early dropout", grepl(">",dit_earl_drop) & motivo_de_egreso=="abandono"~ "late dropout", grepl("<",dit_earl_drop) & grepl("adm", motivo_de_egreso)~ "early adm discharge", grepl(">",dit_earl_drop) & grepl("adm", motivo_de_egreso)~ "late adm discharge", motivo_de_egreso=="alta terapeutica"~ "completion", motivo_de_egreso=="muerte"~ "death", motivo_de_egreso=="derivacion"~ "referral", is.na(motivo_de_egreso)~ "currently in", TRUE~ "other"))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Referral source, collapsed into levels of care.
  tidytable::mutate(referral_type = tidytable::case_when(
    tipo_centro_derivacion %in% c("comunidad terapeutica residencial",
                                          "clinica residencial") ~ "residential SUD treatment",
    tipo_centro_derivacion %in% c("comunidad terapeutica ambulatoria",
                                          "clinica ambulatoria") ~ "ambulatory SUD treatment",
    tipo_centro_derivacion %in% c("cosam", "aps") ~ "primary health care",
    stringr::str_detect(tipo_centro_derivacion, "^hospital") ~ "secondary health care",
    tipo_centro_derivacion == "otro centro" ~ "other facility",
    TRUE ~ NA_character_))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Substances (first used, primary and secondary): copy raw columns, flag
  # unusable values, then map Spanish categories onto English substance groups.
  tidytable::mutate(first_sub_used= sustancia_de_inicio, primary_sub= sustancia_principal, second_sub1= otras_sustancias_no1, second_sub2= otras_sustancias_no2, second_sub3= otras_sustancias_no3)|>
  tidytable::mutate(OBS= case_when(grepl("especif|sin consumo", first_sub_used)|grepl("especif|cip-crc|sin consumo", primary_sub)|grepl("especif|cip-crc|sin consumo", second_sub1)| grepl("especif|cip-crc|sin consumo", second_sub2)|grepl("especif|cip-crc|sin consumo", second_sub3) ~ paste0(OBS,";","3.7. Secondary substances, invalid due to no consumption /unspecified"), TRUE ~ OBS))|>
  tidytable::mutate(
    across(
      all_of(c("first_sub_used", "primary_sub", "second_sub1", "second_sub2", "second_sub3")),
      ~ tidytable::case_when(
        # Unknown & no substance use. (Did not consider: sin sustancia principal (cip-crc))
        grepl("^\\s*$|especif|sin consumo", ., ignore.case = TRUE) ~ NA_character_,
        # Main substances
        grepl("alcohol",                    ., ignore.case = TRUE) ~ "alcohol",
        grepl("pasta|crack",                ., ignore.case = TRUE) ~ "cocaine paste",
        grepl("cocain|cocaina",             ., ignore.case = TRUE) ~ "cocaine powder",
        grepl("marihuan|cannab",            ., ignore.case = TRUE) ~ "marijuana",
        # Dissociatives (covers tussi/ketamine and PCP)
        grepl("ketamin|tussi|tusi|pcp|fenilciclidina|dxm", ., ignore.case = TRUE) ~ "dissociatives",
        # Hallucinogens
        grepl("lsd|hongos|psiloc|ayahuas|mescal|2c|nbome|dmt", ., ignore.case = TRUE) ~ "hallucinogens",
        # Amphetamine-type stimulants
        grepl("anfet|metanfet|extasis|mdma|metilfenidato|ritalin|\\bestimul", ., ignore.case = TRUE) ~
          "amphetamine-type stimulants",
        # Tranquilizers / hypnotics
        grepl("sedante|hipnotic|tranquiliz|benzodiaz", ., ignore.case = TRUE) ~ "tranquilizers/hypnotics",
        # Inhalants
        grepl("inhalable|neopren|nitroso|poppers|solvente|gasolina|diluyente", ., ignore.case = TRUE) ~ "inhalants",
        # Opioids
        grepl("heroina|fentanil|fenta|metadona|codein|tramadol|morfina|meperidin|demerol|oxicod|hidrocod|hidromorf|buprenor|tapentadol|otros opioides analgesicos", ., ignore.case = TRUE) ~ "opioids",
        # Others
        grepl("esteroide|esteroid|otros", ., ignore.case = TRUE) ~ "others",
        TRUE ~ .)))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Marital status: translated and collapsed; non-responses flagged.
  tidytable::mutate(marital_status= tidytable::case_when(grepl("casado|conviviente",estado_conyugal)~"married/cohabiting", grepl("separado|divorciado|anulado", estado_conyugal)~"separated/divorced/annulled", estado_conyugal=="soltero"~"single", estado_conyugal=="viudo"~"widowed", TRUE~NA_character_))|>
  tidytable::mutate(OBS=case_when(grepl("contesta",estado_conyugal)~ paste0(OBS,";","3.8. Marital State, Invalid due to No Response"), TRUE ~ OBS))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Occupational condition = employed, unemployed, inactive.
  # NOTE(review): the fallback classifies every unmatched value (including
  # missing condicion_ocupacional) as "inactive" -- confirm this is intended.
  tidytable::mutate(occupation_condition= case_when(grepl("trabajando", condicion_ocupacional)~"employed", grepl("desempleado", condicion_ocupacional)~"unemployed", grepl("buscando|cesante", condicion_ocupacional)~"unemployed", TRUE~ "inactive"))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Occupation status: only meaningful for the employed; collapse & translate.
  tidytable::mutate(occupation_status= tidytable::case_when(occupation_condition!="employed"~ NA_character_, TRUE~categoria_ocupacional))|>
  tidytable::mutate(occupation_status= case_when(occupation_status=="asalariado"~ "salaried",  occupation_status=="cuenta propia"~ "Self-employed", grepl("volunt",occupation_status)~ "volunteer worker", grepl("familiar",occupation_status)~ "unpaid family labour", grepl("otros",occupation_status)~ "other", TRUE~ occupation_status))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Biopsychosocial compromise: translated and ordered mild < moderate < severe.
  tidytable::mutate(biopsych_comp= tidytable::case_when(compromiso_biopsicosocial=="leve"~'1-mild',  compromiso_biopsicosocial=="moderado"~ '2-moderate', compromiso_biopsicosocial=="severo"~ '3-severe', TRUE~ NA_character_))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Substance dependency diagnoses: translated (switched to tidytable::case_when
  # for consistency with the rest of this pipeline).
  tidytable::mutate(sub_dep_icd10_status= tidytable::case_when(grepl("perj",diagnostico_trs_consumo_sustancia)~"hazardous consumption",grepl("dep",diagnostico_trs_consumo_sustancia)~"drug dependence",TRUE~NA_character_))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Tenure status of household: collapsing and translating.
  tidytable::mutate(tenure_status_household= case_when(tenencia_de_la_vivienda=="allegado"~"stays temporarily with a relative", tenencia_de_la_vivienda=="arrienda"~"renting", grepl("cedida|dividendo|propia", tenencia_de_la_vivienda)~ "owner/transferred dwellings/pays dividends", tenencia_de_la_vivienda=="ocupacion irregular"~"illegal settlement", tenencia_de_la_vivienda=="otros"~"others", TRUE~NA_character_))|> 
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Treatment admission motive: collapsing and translating.
  # NOTE(review): "educacional|trabajo" is captured by the earlier "other"
  # branch, so the last branch only ever matches "otro centro", "fonodrogas"
  # and "previene" -- confirm the intended precedence of these patterns.
  tidytable::mutate(adm_motive= tidytable::case_when(origen_de_ingreso=="consulta espontanea"~"spontaneous consultation", grepl("aps|red de salud", origen_de_ingreso)~"sanitary sector", grepl("juzgado|fiscalia|vigilada", origen_de_ingreso)~"justice sector", grepl("educacional|trabajo|servicios sociales|otros$", origen_de_ingreso)~"other", grepl("educacional|trabajo|otro centro|fonodrogas|previene$", origen_de_ingreso)~"another SUD facility/FONODROGAS/SENDA Previene", TRUE~NA_character_))|>

    #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
  # Educational attainment: translated labels, collapsed into three categories
  # ordered by marginalization due to deprivation; "non responses" flagged.
  # (fix: regex had a duplicated "basica|basica" alternative.)
  # Labels: '1-More than high school','2-Completed high school or less','3-Completed primary school or less'
  tidytable::mutate(ed_attainment= tidytable::case_when(grepl("basica|sin estudios|primaria|kinder|sala cuna|jardin|nunca|especial", escolaridad_ultimo_ano_cursado)~ "3-Completed primary school or less", grepl("media|cientifico|humanidades", escolaridad_ultimo_ano_cursado)~ "2-Completed high school or less", grepl("profesional|tecnica|tecnico|universitaria|doctorado|magister", escolaridad_ultimo_ano_cursado)~"1-More than high school"))|> 
  tidytable::mutate(OBS= case_when(grepl("no sabe", escolaridad_ultimo_ano_cursado)~paste0(OBS,";","3.9. Educational attainment, invalid due to non-response"), TRUE ~ OBS))|> 

  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
  # Primary substance at admission: usage frequency (unknown reports flagged).
  tidytable::mutate(OBS= tidytable::case_when(frecuencia_de_consumo_sustancia_principal=="desconocido" ~ paste0(as.character(OBS),";","3.10. Unknown report of primary substance use frequency"), TRUE ~ as.character(OBS)))|>
  tidytable::mutate(prim_sub_freq= case_when(grepl("menos de|no consumio", frecuencia_de_consumo_sustancia_principal)~ "1. Less than 1 day a week", grepl("1 dias - semana", frecuencia_de_consumo_sustancia_principal)~"2. 1 day a week", grepl("2-3 dias", frecuencia_de_consumo_sustancia_principal)~ "3. 2 to 3 days a week", grepl("4-6 dias", frecuencia_de_consumo_sustancia_principal)~ "4. 4 to 6 days a week", grepl("todos", frecuencia_de_consumo_sustancia_principal)~ "5. Daily", frecuencia_de_consumo_sustancia_principal=="desconocido"~ NA_character_, TRUE~ NA_character_))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
  # Type of center (public vs. private).
  tidytable::mutate(type_center= case_when(tipo_centro=="publico"~"public", tipo_centro=="privado"~"private", TRUE~NA_character_))|> 
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
  # Primary substance at admission: usage route.
  tidytable::mutate(prim_sub_route= tidytable::case_when(grepl("intranasal",via_administracion_sustancia_principal)~"Intranasal (powder aspiration)", grepl("fumada",via_administracion_sustancia_principal)~"Smoked or pulmonary aspiration", grepl("inyectada",via_administracion_sustancia_principal)~"Injected intravenously or intramuscularly", grepl("oral",via_administracion_sustancia_principal)~"Oral (drunk or eaten)", via_administracion_sustancia_principal=="otros"~ "Others", TRUE~ NA_character_))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
  # Regional location of center: territorial code previous to 2018, given
  # that the majority of the data precedes 2018.
  tidytable::mutate(municipallity_res_cutpre18= tidytable::recode(comuna_residencia, !!!municipality_map)) |> 
        (\(df) {
        message(paste0("After normalization, Entries: ", nrow(df)))
        message(paste0("After normalization, RUNs: ", tidytable::distinct(df, hash_key) |> nrow()))
        df
    })()|> 
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
  # Mortality paper: macrozone of the treatment center.
  # (fix: "de la araucania" carried a trailing space, so Araucanía centers
  # fell through to the "0.Metropolitan" default; both spellings are now
  # accepted for backward compatibility with possibly untrimmed raw values.)
  tidytable::mutate(macrozone_center = tidytable::case_when(
    region_del_centro  %in% c("de arica y parinacota", "de tarapaca", "de antofagasta", "de atacama") ~ "1.North",
    region_del_centro  %in% c("de coquimbo", "de valparaiso")~ "2.Center",
    region_del_centro  %in% c("del libertador general bernardo ohiggins", "del maule", "del bio-bio") ~ "3.South-center",
    region_del_centro  %in% c("de la araucania", "de la araucania ", "de los rios", "de los lagos") ~ "4.South",
    # NOTE(review): "aysen" lacks the "de ..." prefix used by every other
    # region string here -- confirm it matches the raw value.
    region_del_centro  %in% c("de magallanes y la antartica chilena", "aysen") ~ "5.Austral",
    TRUE ~ "0.Metropolitan"  # any region code not listed above
  ))|>
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
  # Ana Karen research: pregnancy at admission and at discharge.
  tidytable::mutate(
    pregnant = tidytable::case_when(
      is.na(se_trata_de_una_mujer_embarazada) ~ NA_character_,
      se_trata_de_una_mujer_embarazada == "no" ~ "no",
      se_trata_de_una_mujer_embarazada == "si" ~ "yes"
    ))|>
  tidytable::mutate(
    pregnant_disch = tidytable::case_when(
      is.na(ha_estado_embarazada_egreso) ~ NA_character_,
      ha_estado_embarazada_egreso == "no" ~ "no",
      ha_estado_embarazada_egreso == "si" ~ "yes"
    ))|> 
  #:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:  
  # Administrative discharge motive, translated; unmatched values become NA.
  tidytable::mutate(
  adm_disch_reason = tidytable::case_when(
    motivo_de_egreso_alta_administrativa == "fallecimiento" ~ "death",
    motivo_de_egreso_alta_administrativa == "incumplimiento grave a las normas de convivencia del programa" ~ "rule_violation",
    motivo_de_egreso_alta_administrativa == "termino de convenio" ~ "agreement_end",
    motivo_de_egreso_alta_administrativa == "traslado de domicilio a zonas o lugares sin disponibilidad de oferta de tratamiento" ~ "no_local_service"
  ))

Discarding parole & missing plans, Entries: 174110

Discarding parole & missing plans, RUNs: 121448

After normalization, Entries: 174107

After normalization, RUNs: 121447

Code
# Strip the leading ";" (plus trailing whitespace) left over when OBS
# started out empty before the first tag was appended.
SISTRAT23_c1_2010_2024_df_prev1g$OBS <-
  stringr::str_remove(SISTRAT23_c1_2010_2024_df_prev1g$OBS, "^;\\s*")


4. Exploratory Probabilistic Deduplication

One of the main objectives of this stage of the project is to identify and separate each treatment for each user in a given admission and discharge date as a unique entity. This is shown in diagram of data preparation. The standardization of the age let us reduce a great amount of comparisons between every pair of records, making matching more feasible in terms of computational resources. For example, if we decided to compare each pair under consideration, we would have the number of cases (n= 174,107) multiplied by itself, leading to a total of 30,313,247,449 comparisons. To reach our objective, it was necessary to explore the principal causes that explain why or how a case matched with another. From duplicated cases, we knew how many records shared the same HASH and date of admission. But we needed to explore whether there would be other possible rules that would help to identify distinct treatments.

4.1 Perfect Duplicates of HASH and Date of Admission

Code
# Flag rows whose (hash_key, adm_date_rec) combination already appeared
# earlier in the data, stored in a separate data frame as recommended.
# (fix: 1:nrow(...) replaced by seq_len(), safe for zero-row inputs.)
duplicated_rows_concat <- data.frame(
  duplicated_HASH_date = duplicated(SISTRAT23_c1_2010_2024_df_prev1g[, c("hash_key", "adm_date_rec")]),
  row_dup_HASH_date    = seq_len(nrow(SISTRAT23_c1_2010_2024_df_prev1g))
)

# Number of exact duplicates (occurrences beyond the first one)
duplicated_rows_concat |> dplyr::filter(duplicated_HASH_date) |> nrow()

# Distribution of how many times each (hash_key, adm_date_rec_num) pair
# occurs. NOTE: groups of size 2, 3, 4, 5+ also contain people who are
# otherwise unique, so a plain distinct() would not capture them.
data.table::as.data.table(SISTRAT23_c1_2010_2024_df_prev1g)[, dup_hash_date := .N, by = c("hash_key","adm_date_rec_num")] |>
  tidytable::group_by(dup_hash_date) |> 
  tidytable::summarise(n=n()) |> 
  tidytable::mutate(perc = round(n / sum(n),2)*100) |> 
  tidytable::mutate(perc = paste0(perc,"%")) |> 
  tidytable::mutate(Tot.Cases = n/dup_hash_date)|> 
  knitr::kable(format = "markdown", format.args = list(decimal.mark = ".", big.mark = ","),
                   caption="Times that the combination of HASH-Key & Date of Admission may appear in the dataset", align =rep('c', 4))

# Times that the combination of HASH-Key & Date of Admission may appear in the dataset
# dup_hash_date n   perc    Tot.Cases
# 1 174,107 100%    174,107
[1] 0
Times that the combination of HASH-Key & Date of Admission may appear in the dataset
dup_hash_date n perc Tot.Cases
1 174,107 100% 174,107

In Table 13, we can see that no combination of HASH-Key and date of admission appeared more than once in the dataset. As done in Section 1.c, we resolved issues of this sort.

Code
# 1) Build a quarter label such as "2021Q3" for every admission; rows with
#    a missing admission date get NA instead of a malformed label.
SISTRAT23_c1_2010_2024_df_prev1g <- SISTRAT23_c1_2010_2024_df_prev1g |>
  tidytable::mutate(
    adm_quarter = tidytable::if_else(
      is.na(adm_date_rec),
      NA_character_,
      paste0(lubridate::year(adm_date_rec), "Q", lubridate::quarter(adm_date_rec))
    )
  )

# 2) Row-wise duplicate indicator for (hash_key, adm_quarter): TRUE marks a
#    row whose key combination already appeared earlier in the table.
duplicated_rows_concat_q <- data.frame(
  duplicated_HASH_quarter = duplicated(SISTRAT23_c1_2010_2024_df_prev1g[, c("hash_key","adm_quarter")]),
  row_dup_HASH_quarter    = seq_len(nrow(SISTRAT23_c1_2010_2024_df_prev1g))
)

# Count duplicate rows (beyond the first occurrence)
#sum(duplicated_rows_concat_q$duplicated_HASH_quarter)  # or:
# duplicated_rows_concat_q |> dplyr::filter(duplicated_HASH_quarter) |> nrow()
#2418

# 3) Distribution of how many times each (hash_key, adm_quarter) appears.
#    `:=` stamps the per-group count onto a fresh data.table copy (the
#    original data frame is not modified); the chained `[` calls then
#    aggregate, order, and format the summary for the markdown table.
data.table::as.data.table(SISTRAT23_c1_2010_2024_df_prev1g)[
  , dup_hash_quarter := .N, by = .(hash_key, adm_quarter)
][, .(n = .N), by = dup_hash_quarter
][order(dup_hash_quarter)
][, perc := paste0(round(n / sum(n), 2) * 100, "%")
][, Tot.Cases := n / dup_hash_quarter] |>
  knitr::kable(
    format = "markdown",
    format.args = list(decimal.mark = ".", big.mark = ","),
    caption = "Times that the combination of HASH-Key & Quarter of Admission may appear in the dataset",
    align = rep("c", 4)
  )

However, 3% of the sample had more than one treatment within the same quarter.

At this stage of the research, we needed to detect more complex patterns, in terms of cases with similar HASH-Key and date of admission. This approach is merely exploratory and aims to find cases with imperfect agreements on one or more of the variables. Once some variables are standardized, we would be able to use this approach to detect and replace values and erase duplicated cases.

  • We built 5-year blocks to limit the comparisons to similar birth years
  • We limited the self-pairing to cases where the row number value was lower than the candidate pair, to avoid duplicate/reversed pairs
  • The comparison is restricted also to admission dates <90 days apart
  • Drop admissions with missing admission dates
  • Then, we constructed a similarity score per pair:
    • +50 / −50 for hash_key match/mismatch
    • +10 / +0 for sexo match/mismatch
    • +10 / +0 for id_centro match/mismatch
    • +15 / +0 for region_del_centro match/mismatch
    • +5 / +0 for ed_attainment match/mismatch
    • +5 / +0 for primary_sub match/mismatch
    • +5 / +0 for first_sub_used match/mismatch
    • Also stores date_diff (absolute days between admissions).
  • Keeps scored pairs with score ≥ 10 (matches_scored_filtered) and removes the unfiltered scored table.
Code
# Candidate-pair generation for probabilistic deduplication:
# 1) block on 5-year birth-year bins to bound the O(n^2) comparison space,
# 2) interval-join admissions within +/- 90 days inside each block,
# 3) score each pair on agreement of identity/context fields.

# Keep a 5-year birth block on the main table (fix: reference the column
# directly instead of re-indexing the whole data frame inside mutate()).
SISTRAT23_c1_2010_2024_df_prev1g <- SISTRAT23_c1_2010_2024_df_prev1g|>
  tidytable::mutate(yr_block = base::floor(lubridate::year(birth_date_rec) / 5) * 5)

# Working copy restricted to rows with a known admission date and the
# columns needed for blocking and scoring.
DT <- data.table::as.data.table(SISTRAT23_c1_2010_2024_df_prev1g)[
  !is.na(adm_date_rec),
  .(rn, hash_key, sexo, id_centro, region_del_centro,
    ed_attainment, primary_sub, first_sub_used,
    adm_date_rec, birth_date_rec)
]
# Integer-typed 5-year block used as the join key (same values as the
# column added above, recomputed here so the key is integer).
DT[, yr_block := (as.integer(format(birth_date_rec, "%Y")) %/% 5L) * 5L]

# Build interval tables: X is a point [date,date]; Y is a window [date-90, date+90]
X <- data.table::copy(DT)[, `:=`(start = adm_date_rec, end = adm_date_rec)]
Y <- data.table::copy(DT)[, `:=`(start = adm_date_rec - 90L, end = adm_date_rec + 90L)]

# Keys for fast interval join
data.table::setkey(X, yr_block, start, end)
data.table::setkey(Y, yr_block, start, end)

# Overlap join: rows in X whose [start,end] fall within Y's [start,end] in same yr_block
pairs <- data.table::foverlaps(
  X, Y,
  by.x = c("yr_block","start","end"),
  by.y = c("yr_block","start","end"),
  nomatch = 0L, type = "within"
)[rn < i.rn]  # keep each unordered pair once; also drops self-pairs

# Similarity score and absolute day gap between the two admissions.
# NOTE(review): fifelse() yields NA whenever a compared field is NA, which
# makes the whole score NA; such pairs are dropped by the score filter below.
pairs[, date_diff := abs(as.integer(start - i.start))]
pairs[, score :=
  data.table::fifelse(hash_key == i.hash_key, 50L, -50L) +
  data.table::fifelse(sexo == i.sexo, 10L, 0L) +
  data.table::fifelse(id_centro == i.id_centro, 10L, 0L) +
  data.table::fifelse(region_del_centro == i.region_del_centro, 15L, 0L) +
  data.table::fifelse(ed_attainment == i.ed_attainment, 5L, 0L) +
  data.table::fifelse(primary_sub   == i.primary_sub,   5L, 0L) +
  data.table::fifelse(first_sub_used== i.first_sub_used,5L, 0L)
]

# Final projection + threshold (score >= 10 keeps same-hash pairs and
# highly concordant pairs only).
matches_scored_filtered <- pairs[score >= 10L, .(
  id_a = rn, id_b = i.rn,
  score, date_diff,
  a_hash_key = hash_key, b_hash_key = i.hash_key,
  a_sexo = sexo, b_sexo = i.sexo,
  a_id_centro = id_centro, b_id_centro = i.id_centro,
  a_adm_date_rec = start, b_adm_date_rec = i.start
)]

# Free the large intermediates (~3,490,087 candidate pairs before filtering)
rm(DT); rm(X); rm(Y); rm(pairs)

For scored matches, only 5574 had values over 10.

Code
# Per-hash frequency of appearances among scored pairs (>= 10 points).
# The pair table is stacked with a copy in which a_hash_key is overwritten
# by b_hash_key, so both members of every pair contribute to the counts.
matches_scored_filtered|> 
  rbind.data.frame(mutate(matches_scored_filtered, a_hash_key= b_hash_key))|> 
  group_by(a_hash_key)|> 
  summarise(n= n())|> 
    (\(df) {
        # Total is halved because every pair was double-counted above.
        # NOTE(review): halving assumes the a- and b-side hash sets balance
        # out; hashes appearing on only one side make this approximate --
        # confirm against the reported figure.
        message(paste0("Total: ", length(df$n)/2))
        message(paste0("1st quartile: ", quantile(df$n,.25)))
        message(paste0("2nd quartile: ", quantile(df$n,.5)))
        message(paste0("3rd quartile: ", quantile(df$n,.75)))
        message(paste0("Percentile 97.5: ", round(quantile(df$n,.975),2)))
        df
    })()|> 
  pull(n)|> 
  hist(main= "Histogram of Scored Matches (>10 points)", xlab="Times present", breaks =20)

Total: 2392

1st quartile: 2

2nd quartile: 2

3rd quartile: 2

Percentile 97.5: 6

Cases with similar admission dates and same HASH

Cases with similar admission dates and same HASH

Code
# 1) Reproducibly sample 10 strongly matched pairs (score > 70) ONCE and
#    collect both row ids of each pair into a single de-duplicated vector.
#    The seed pins the slice_sample() draw; do not reorder calls before it.
set.seed(2125)
idx <- matches_scored_filtered|>
  dplyr::filter(score>70)|>
  dplyr::slice_sample(n = 10)|>
  dplyr::select(id_a, id_b)|>
  tidyr::pivot_longer(dplyr::everything(), values_to = "rn")|>
  dplyr::pull(rn)|>
  unique()

# 2) Restrict the main table to the sampled rows and keep the display
#    columns; hash_key is re-coded to small integers so no real hash leaks
#    into the rendered table.
out <- SISTRAT23_c1_2010_2024_df_prev1g|>
  dplyr::semi_join(tibble::tibble(rn = idx), by = "rn")|>
  dplyr::select(TABLE, hash_key, id_centro, tipo_de_plan, senda,
    adm_date_rec, dit_rec, disch_date, tr_compliance, 
    adm_disch_reason)|>
  dplyr::mutate(hash_key = as.integer(factor(hash_key)))

# 3) Render as a scrollable HTML table.
knitr::kable(
    out,
    format = "html",
    format.args = list(decimal.mark = ".", big.mark = ","),
    col.names = c("Date of retrieval","HASH","Center ID","Plan type","SENDA",
        "Admission date","Days in treatment","Discharge date",
        "Cause of discharge","Motive of adm. discharge"),
    align = rep("c", ncol(out)),   # center every column
    caption = "Table 15. Example of probabilistic matches (example with 10 HASHs)")|>
    kableExtra::scroll_box(width = "100%", height = "350px")
Table 15. Example of probabilistic matches (example with 10 HASHs)
Date of retrieval HASH Center ID Plan type SENDA Admission date Days in treatment Discharge date Cause of discharge Motive of adm. discharge
2024 1 682 pg-pai si 2024-04-10 51 2024-05-31 referral
2024 1 682 pg-pr si 2024-06-17 30 2024-07-17 referral
2018 2 176 pg-pai si 2018-05-02 58 2018-06-29 referral
2018 2 179 pg-pr si 2018-07-04 50 2018-08-23 early dropout
2021 3 291 pg-pai si 2021-02-26 53 2021-04-20 referral
2022 3 644 pg-pr si 2021-04-21 352 2022-04-08 completion
2024 4 443 pg-pai si 2023-12-01 87 2024-02-26 referral
2024 4 411 m-pr si 2024-02-27 36 2024-04-03 early dropout
2016 5 133 pg-pr si 2016-03-07 85 2016-05-31 referral
2016 5 133 m-pai si 2016-06-03 349 2017-05-18 completion
2019 6 202 pg-pai si 2019-02-08 52 2019-04-01 referral
2019 6 489 pg-pab si 2019-04-09 675 2021-02-12 completion
2017 7 266 pg-pr si 2017-06-07 29 2017-07-06 referral
2018 7 248 pg-pai si 2017-07-25 190 2018-01-31 completion
2020 8 176 pg-pai si 2019-12-19 60 2020-02-17 referral
2020 8 179 pg-pr si 2020-02-18 338 2021-01-21 late dropout
2017 9 290 pg-pai si 2017-03-28 50 2017-05-17 referral
2017 9 303 pg-pr si 2017-05-18 12 2017-05-30 referral
2015 10 119 pg-pai si 2015-06-01 60 2015-07-31 referral
2016 10 117 pg-pr si 2015-08-05 170 2016-01-22 late dropout

In the example, we can notice that some cases shared center IDs, some did not have strictly overlapping dates, and some had different reasons for discharge, suggesting continuity across treatment episodes. This approach is useful to identify cases that may have been duplicated due to data-entry errors. We will use it to identify and correct duplicated cases in the next stage of the project.


4.2 Overlappings

An analysis of duplicated events showed that many ranges between the dates of admission and discharge were overlapping due to referrals to other centers, principally by changes in the treatment center. However, to identify overlappings in treatments, it is necessary to obtain from other sources the missing dates as much as possible and clean the dates that may be incorrectly formatted.

Code
# Candidate table for overlap detection: one row per non-referral record
# (referrals legitimately chain treatments, so they are excluded), with
# numeric admission/discharge dates for the interval comparison below.
CONS_C1_df_dup_intervals <- SISTRAT23_c1_2010_2024_df_prev1g |>
  rename(hash_key_2 = hash_key, row2 = rn) |>
  select(
    row2, hash_key_2, codigo_identificacion, TABLE_rec, adm_date_rec, disch_date, adm_date_rec_num, disch_date_num,
    adm_age_rec, nombre_centro_rec, tr_compliance, senda
  ) |>
  filter(tr_compliance != "referral") |>
  data.table::as.data.table()

# Find overlapping treatment intervals with a SQL self-join via sqldf
# (SQLite backend): same hash, later row, and the two [admission, discharge]
# ranges intersect.
overlap_dates_C1 <- janitor::clean_names(sqldf::sqldf("
  SELECT * -- no me interesa sólo de x, sino de ambos
  FROM CONS_C1_df_dup_intervals x
  INNER JOIN CONS_C1_df_dup_intervals y
    ON x.hash_key_2 = y.hash_key_2
    AND x.row2 < y.row2  -- Prevents duplicate pairs
    AND x.adm_date_rec_num < y.disch_date_num -- Admitted before being admitted into another treatment
    AND x.disch_date_num > y.adm_date_rec_num -- Discharge after being admitted in other
"))

#arrange(overlap_dates_C1, hash_key_2, adm_date_rec_num, row2)

# Guard: warn if any HASH appears in more than one overlapping pair.
# `anyDuplicated()` returns the index of the first duplicate (0 if none),
# which is equivalent to (and much cheaper than) the previous
# group_by |> mutate(n = n()) |> filter(n > 1) |> nrow() > 0 check.
if (anyDuplicated(overlap_dates_C1$hash_key_2) > 0) {
  warning("There are HASHs with more than one overlapping")
} else {
  message("HASHs with overlapping dates do not repeat themselves")
}

#nrow(overlap_dates_C1) # pasamos de 1062 a
#394

# Plot
# overlap_dates_C1 adm_date_rec, disch_date
# Order pairs by hash and admission date so each HASH's segments line up,
# then draw each record of the pair as a horizontal segment (blue = first
# record, red = second record) after coord_flip().
overlaps_sorted <- arrange(overlap_dates_C1, hash_key_2, adm_date_rec_num, row2)

ggplot(overlaps_sorted) +
  geom_segment(
    aes(y = as.POSIXct(adm_date_rec), yend = as.POSIXct(disch_date),
        x = hash_key_2, xend = hash_key_2),
    color = "blue", alpha = .5, linewidth = 1.3
  ) +
  geom_segment(
    aes(y = as.POSIXct(adm_date_rec_2), yend = as.POSIXct(disch_date_2),
        x = hash_key_2, xend = hash_key_2),
    color = "red", alpha = .5, linewidth = 1.3
  ) +
  coord_flip() +
  # A fixed yearly axis was considered here (scale_x_datetime with
  # date_breaks = "1 year") but is left to ggplot's defaults.
  theme_minimal(base_size = 12) +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "none",
    panel.grid = element_blank(),
    plot.caption = element_text(hjust = 0, face = "italic")
  ) +
  labs(
    x = "Dates of admission and discharge", 
    y = "",
    caption = "Note. Only users that share characteristics and overlap between them"
  )
Figure. Trajectories of HASHs from dates of admission to discharge

Figure. Trajectories of HASHs from dates of admission to discharge

Figure 6.1 shows 401 record pairs that share the same HASH key, where the date of admission is earlier than the date of discharge of another entry in the dataset, and the date of discharge is later than the date of admission of that other entry. It does not include referral as a cause of discharge. These conditions let us see how many cases overlap with another entry in the dataset. This graphic may seem a bit noisy because it covers all the overlapped cases, but we should look less at the densely colored regions and more at the white areas between the lines to get an idea of the years that accumulate more overlaps.


4.3 Missing Dates of Discharge

It is not possible to report Table 16 because there are no treatment dates in this database.

Code
# Base subset: episodes with a missing discharge date, plus the elapsed time
# (in days) from admission to the data-retrieval cutoff (2025-05-28, "today"
# at the time of analysis) and its difference with the recorded duration.
# Computed once, instead of duplicating the same pipeline twice as before.
miss_disch_base <-
  SISTRAT23_c1_2010_2024_df_prev1g |>
    tidytable::filter(is.na(disch_date)) |>
    tidytable::mutate(
      dit_trans = lubridate::time_length(
        lubridate::interval(adm_date_rec, as.Date("2025-05-28")), unit = "days"
      ),
      diff_treat_days = abs(dit_rec - dit_trans)
    ) |>
    tidytable::select(
      hash_key, codigo_identificacion, TABLE, sexo, adm_date_rec, disch_date,
      adm_date_rec_num, disch_date_num, dit_trans, dit_rec, diff_treat_days
    )

# Episodes whose open interval already exceeds 3 years (1095 days): these are
# genuinely missing discharge dates, not mere right-truncation of the data.
disch_date_more_three_years <-
  miss_disch_base |>
    tidytable::filter(dit_trans > 1095) |>
    tidytable::group_by(TABLE) |>
    tidytable::summarize(n = n())

# All missing discharge dates by retrieval year, alongside the >3-year subset.
miss_disch_base |>
    tidytable::group_by(TABLE) |>
    tidytable::summarize(n = n()) |>
    tidytable::left_join(disch_date_more_three_years, by = "TABLE") |>
    knitr::kable(format = "markdown", format.args = list(decimal.mark = ".", big.mark = ","),
                 caption="Missing Date of Discharge by Database year",  align =rep('c', 2),
                 col.names= c("Retrieval year", "Missing disch. dates", "Missing disch.dates, treatment >over 3yrs"))
Missing Date of Discharge by Database year
Retrieval year Missing disch. dates Missing disch.dates, treatment >over 3yrs
2010 17 17
2011 68 68
2012 11 11
2013 23 23
2014 16 16
2015 30 30
2016 55 55
2017 68 68
2018 52 52
2019 43 43
2020 39 39
2021 41 41
2022 108 68
2023 138 23
2024 2,924 21

Many of the 2024 records contain missing dates of discharge. This is a problem because the date of discharge is necessary to calculate the days of treatment. This may be due to the fact that the database is right-truncated. What is more intriguing is the number of missing discharge dates in 2022. We will need to review records with the same or similar admission dates and obtain the real missing discharge dates. Of the total treatment records, 575 (0.33%) are over 3 years in duration and hence had missing—not right-truncated—discharge dates.

Code
# Keep the records with a missing discharge date, reporting the subset's size
# (total rows and distinct HASHs) as it is created.
miss_disch_dates <- tidytable::filter(SISTRAT23_c1_2010_2024_df_prev1g, is.na(disch_date))
message(paste0("Missing discharge dates, Total: ", nrow(miss_disch_dates)))
message(paste0("Missing discharge dates, Number of HASHs: ",
               nrow(tidytable::distinct(miss_disch_dates, hash_key))))

Missing discharge dates, Total: 3633

Missing discharge dates, Number of HASHs: 3628

Code
# Missing discharge dates, Total: 3659
# Missing discharge dates, Number of HASHs: 3654

As stated in the meeting of Jan. 13, 2020, an alternative would be to impute days of treatment and generate a new date of discharge by adding the days of treatment to the date of admission. We may check whether these cases have the information necessary for imputation, and whether they may have overlapping dates with other subsequent admissions.


5. Preliminary Summary in August 2025

Many selections for the purposes of the study are still necessary today, in order to keep the greatest amount of information about each event.

Code
#knitr::include_graphics("G:/Mi unidad/Alvacast/SISTRAT 2019 (github)/SUD_CL/Figures/Figure_Duplicates.svg")
# Flowchart of the deduplication pipeline. Everything between grViz(" and ")
# is one DOT string: nodes a..g get their labels from the [1]:..[7]: footnote
# expressions below the closing brace, which are evaluated in R and spliced
# in via the @@1..@@7 placeholders. Counts come from objects built earlier in
# this document (SISTRAT23_c1_2010_2024_df2, ..._prev1b, ..._prev1g,
# overlap_dates_C1, matches_scored_filtered).
DiagrammeR::grViz("
digraph graph2 {

graph [layout = dot]

# node definitions with substituted label text
node [shape = rectangle, width = 4, color = 'steelblue',fillcolor = lightblue]
a [label = '@@1']
b [label = '@@2']
c [label = '@@3']
d [label = '@@4']
e [label = '@@5']
f [label = '@@6']
g [label = '@@7']

a -> b -> c -> {d e f g}

}

[1]:  paste0('Once removed same values in >100 variables (n = ', formatC(nrow(SISTRAT23_c1_2010_2024_df2), format='f', big.mark=',', digits=0), ')')
[2]: paste0('Once removed same values in variables related to treatments and substance use (n = ', formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1b), format='f', big.mark=',', digits=0), ')')
[3]: paste0('Preliminary Dataset (n = ', formatC(nrow(SISTRAT23_c1_2010_2024_df_prev1g), format='f', big.mark=',', digits=0), ')')
[4]: paste0('Same HASH &\\nDate of Admission (n = ', formatC(0, format='f', big.mark=',', digits=0), ')')
[5]: paste0('Overlapped Ranges\\nof Treatments (n = ', formatC(nrow(overlap_dates_C1), format='f', big.mark=',', digits=0), ')')
[6]: paste0('Pair of probabilistic scored matches (>10 points) &\\nnon-scored matches of RUN &\\n|10-day| difference in admission dates (n = ', formatC(nrow(matches_scored_filtered), format='f', big.mark=',', digits=0), ')')
[7]: paste0('Missing discharge dates \\n w/possible overlapping in subsequent\\ntreatments (n = ', formatC(nrow(tidytable::filter(SISTRAT23_c1_2010_2024_df_prev1g, is.na(disch_date))), format='f', big.mark=',', digits=0), ')')
")#
Code
#add c46caa3cd2c89a2222ce319cf6f5e98392f928e0544ee*******************
#add discarded from invalid cases such as portilla
#do not add birth date corrections


To close the project, we erase polars objects.

Code
# Drop the polars objects (names ending in "_pl") before saving the workspace
rm(list = grep("_pl$", ls(), value = TRUE))

Session info

Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
# Report which (renv) library path is active for this session.
# message() concatenates its arguments with no separator, like paste0().
message("R library: ", Sys.getenv("R_LIBS_USER"))

R library: G:/My Drive/Alvacast/SISTRAT 2023/renv/library/windows/R-4.4/x86_64-w64-mingw32

Code
# Timestamp rendered under the C locale so the format is machine-independent
message(paste0("Date: ", withr::with_locale(new = c(LC_TIME = "C"), code = Sys.time())))

Date: 2025-09-27 13:57:04.737026

Code
# `path` is defined earlier in the document (rendering/editor context) -- TODO confirm
message(paste0("Editor context: ", path))

Editor context: G:/My Drive/Alvacast/SISTRAT 2023/cons

Code
# Print the quarto CLI version detected by the quarto R package
cat("quarto version: ")
quarto::quarto_version()
quarto version: 
[1] '1.7.29'
Code
# Capture session/package metadata for the reproducibility tables below
sesion_info <- devtools::session_info()

Warning in system2(“quarto”, “-V”, stdout = TRUE, env = paste0(“TMPDIR=”, : el comando ejecutado ‘“quarto” TMPDIR=C:/Users/andre/AppData/Local/Temp/RtmpgnsPMu/file92cc223746c -V’ tiene el estatus 1

Code
# Interactive DT table of loaded R packages (name, loaded version, source).
# The initComplete JS callback shrinks the table typography; its string
# contents are passed verbatim to the browser.
dplyr::select(
  tibble::as_tibble(sesion_info$packages),
  c(package, loadedversion, source)
) %>% 
  DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
              caption = htmltools::tags$caption(
        style = 'caption-side: top; text-align: left;',
        '', htmltools::em('R packages')),
      options=list(
initComplete = htmlwidgets::JS(
        "function(settings, json) {",
        "$(this.api().tables().body()).css({
            'font-family': 'Helvetica Neue',
            'font-size': '70%', 
            'code-inline-font-size': '15%', 
            'white-space': 'nowrap',
            'line-height': '0.75em',
            'min-height': '0.5em'
            });",
        "}")))
Code
#|echo: true
#|error: true
#|message: true
#|paged.print: true
#|class-output: center-table

# Interactive DT table of the Python packages visible to reticulate's
# configured interpreter, styled the same way as the R package table.
reticulate::py_list_packages() %>% 
  DT::datatable(filter = 'top', colnames = c('Row number' =1,'Package' = 2, 'Version'= 3),
              caption = htmltools::tags$caption(
        style = 'caption-side: top; text-align: left;',
        '', htmltools::em('Python packages')),
      options=list(
initComplete = htmlwidgets::JS(
        "function(settings, json) {",
        "$(this.api().tables().body()).css({
            'font-family': 'Helvetica Neue',
            'font-size': '70%', 
            'code-inline-font-size': '15%', 
            'white-space': 'nowrap',
            'line-height': '0.75em',
            'min-height': '0.5em'
            });",
        "}"))) 

Warning in system2(python, args, stdout = TRUE): el comando ejecutado ‘“G:/My Drive/Alvacast/SISTRAT 2023/.mamba_root/envs/py311/python.exe” -m pip freeze’ tiene el estatus 1

Save

Code
# Derive the project root by stripping every "cons" from getwd()/cons.
# NOTE(review): this leaves trailing slashes in wdpath (e.g. ".../2023//");
# later chunks build paths with paste0(wdpath, ...) and rely on that, so the
# exact string is preserved here.
wdpath <- gsub("/cons", "", gsub("cons", "", paste0(getwd(), "/cons")))
# Choose the cloud-drive environment path by the drive letter of wdpath
envpath <- if (regmatches(wdpath, regexpr("[A-Za-z]+", wdpath)) == "G") {
  "G:/Mi unidad/Alvacast/SISTRAT 2023/"
} else {
  "E:/Mi unidad/Alvacast/SISTRAT 2023/"
}

# Echo the relevant paths (auto-printed in the rendered document)
paste0(getwd(), "/cons")
file.path(paste0(wdpath, "data/20241015_out"))
file.path(paste0(envpath, "data/20241015_out"))

# Save the full workspace image under a dated filename
rdata_path <- file.path(wdpath, "data/20241015_out", paste0("23_ndp_", format(Sys.time(), "%Y_%m_%d"), ".Rdata"))

save.image(rdata_path)
cat("Saved in:", rdata_path)

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Load the encryption secret into PASSWORD_SECRET. On the server/Docker it is
# expected to already be in the environment; locally it is read from
# secret.txt. The original interactive branch read secret.txt and called
# Sys.setenv twice; here the file is read exactly once, while keeping the
# history save/restore that keeps the secret out of the console history.
# NOTE(review): assumes secret.txt holds a single line -- TODO confirm.
if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
  password <- Sys.getenv("PASSWORD_SECRET")
} else {
  if (interactive()) {
    # Park the current console history in a temp file before touching the secret
    utils::savehistory(tempfile())
  }
  Sys.setenv(PASSWORD_SECRET = readLines(paste0(wdpath, "secret.txt"), warn = FALSE))
  if (interactive()) {
    utils::loadhistory()
  }
}

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# NOTE(review): save.image() first writes the .enc file as a *plaintext*
# workspace image; it only becomes encrypted after secret_encrypt_file()
# below rewrites it in place.
save.image(paste0(rdata_path,".enc"))

# Encrypt the file in place; "PASSWORD_SECRET" names the env var holding the key
httr2::secret_encrypt_file(path = paste0(rdata_path,".enc"), key = "PASSWORD_SECRET")

#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
cat("Copy renv lock into cons folder\n")

# Mirror renv.lock into the cons/ folder so the rendered project is
# self-contained. Skipped on RStudio Server / Docker.
if (Sys.getenv("RSTUDIO_SESSION_TYPE") == "server" || file.exists("/.dockerenv")) {
  message("Running on RStudio Server or inside Docker. Folder copy skipped.")

} else {
  # Copy the single renv.lock file (the former source_folder/destination_folder
  # assignments were dead code — nothing used them — and have been removed)
  file.copy(paste0(wdpath, "renv.lock"), paste0(wdpath, "cons/renv.lock"), overwrite = TRUE)

  message("Renv lock copy performed.")
}

Renv lock copy performed.

Code
#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:#:
# Stop the timer started at the top of the document and print the elapsed time
time_after_dedup1 <- Sys.time()

print(paste0("Time in markdown: "))
print(time_after_dedup1 - time_before_dedup1)
[1] "G:/My Drive/Alvacast/SISTRAT 2023/cons/cons"
[1] "G:/My Drive/Alvacast/SISTRAT 2023//data/20241015_out"
[1] "G:/Mi unidad/Alvacast/SISTRAT 2023/data/20241015_out"
Saved in: G:/My Drive/Alvacast/SISTRAT 2023///data/20241015_out/23_ndp_2025_09_27.RdataCopy renv lock into cons folder
[1] "Time in markdown: "
Time difference of 20.35914 mins
Back to top